mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-07-23 12:51:40 +02:00
Refactor: Archive workers and Tasks
Work in progress
This commit is contained in:
@@ -162,7 +162,7 @@ func LoadData(job *schema.Job,
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
prepareJobData(job, jd, scopes)
|
||||
prepareJobData(jd, scopes)
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
@@ -266,7 +266,6 @@ func cacheKey(
|
||||
// statisticsSeries should be available so that a min/median/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
func prepareJobData(
|
||||
job *schema.Job,
|
||||
jobData schema.JobData,
|
||||
scopes []schema.MetricScope,
|
||||
) {
|
||||
|
112
internal/repository/archiveWorker.go
Normal file
112
internal/repository/archiveWorker.go
Normal file
@@ -0,0 +1,112 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// Archiving worker thread
|
||||
func (r *JobRepository) archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-r.archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := r.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update the jobs database entry one last time:
|
||||
if err := r.MarkArchived(jobMeta, schema.MonitoringStatusArchivingSuccessful); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
r.archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) MarkArchived(
|
||||
jobMeta *schema.JobMeta,
|
||||
monitoringStatus int32,
|
||||
) error {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobMeta.JobID)
|
||||
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
footprint := make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
footprint[fp] = LoadJobStat(jobMeta, fp)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
|
||||
if rawFootprint, err = json.Marshal(footprint); err != nil {
|
||||
log.Warnf("Error while marshaling footprint for job, DB ID '%v'", jobMeta.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
stmt = stmt.Set("footprint", rawFootprint)
|
||||
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warn("Error while marking job as archived")
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func (r *JobRepository) TriggerArchiving(job *schema.Job) {
|
||||
r.archivePending.Add(1)
|
||||
r.archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func (r *JobRepository) WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
r.archivePending.Wait()
|
||||
}
|
8
internal/repository/footprintWorker.go
Normal file
8
internal/repository/footprintWorker.go
Normal file
@@ -0,0 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
func (r *JobRepository) footprintWorker() {
|
||||
}
|
@@ -5,7 +5,6 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
@@ -15,7 +14,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
@@ -278,101 +276,6 @@ func (r *JobRepository) DeleteJobById(id int64) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) MarkArchived(
|
||||
jobMeta *schema.JobMeta,
|
||||
monitoringStatus int32,
|
||||
) error {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobMeta.JobID)
|
||||
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
footprint := make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
footprint[fp] = LoadJobStat(jobMeta, fp)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
|
||||
if rawFootprint, err = json.Marshal(footprint); err != nil {
|
||||
log.Warnf("Error while marshaling footprint for job, DB ID '%v'", jobMeta.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
stmt = stmt.Set("footprint", rawFootprint)
|
||||
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warn("Error while marking job as archived")
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Archiving worker thread
|
||||
func (r *JobRepository) archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-r.archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := r.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update the jobs database entry one last time:
|
||||
if err := r.MarkArchived(jobMeta, schema.MonitoringStatusArchivingSuccessful); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
r.archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func (r *JobRepository) TriggerArchiving(job *schema.Job) {
|
||||
r.archivePending.Add(1)
|
||||
r.archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func (r *JobRepository) WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
r.archivePending.Wait()
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) {
|
||||
if _, err := strconv.Atoi(searchterm); err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId
|
||||
return searchterm, "", "", ""
|
||||
|
70
internal/taskManager/retentionService.go
Normal file
70
internal/taskManager/retentionService.go
Normal file
@@ -0,0 +1,70 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterRetentionService(cfg schema.Retention) {
|
||||
switch cfg.Policy {
|
||||
case "delete":
|
||||
|
||||
log.Info("Register retention delete service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().CleanUp(jobs)
|
||||
|
||||
if cfg.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %s", err.Error())
|
||||
}
|
||||
}
|
||||
}))
|
||||
case "move":
|
||||
log.Info("Register retention move service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().Move(jobs, cfg.Location)
|
||||
|
||||
if cfg.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %v", err)
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %v", err)
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
}
|
27
internal/taskManager/stopJobsExceedTime.go
Normal file
27
internal/taskManager/stopJobsExceedTime.go
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterStopJobsExceedTime() {
|
||||
log.Info("Register undead jobs service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(03, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
err := jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
|
||||
}
|
||||
runtime.GC()
|
||||
}))
|
||||
}
|
29
internal/taskManager/taskManager.go
Normal file
29
internal/taskManager/taskManager.go
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
var (
|
||||
s gocron.Scheduler
|
||||
jobRepo *repository.JobRepository
|
||||
)
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
jobRepo = repository.GetJobRepository()
|
||||
s, err = gocron.NewScheduler()
|
||||
if err != nil {
|
||||
log.Fatalf("Error while creating gocron scheduler: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
s.Shutdown()
|
||||
}
|
Reference in New Issue
Block a user