diff --git a/repository/job.go b/repository/job.go index d6866ca..8c2d339 100644 --- a/repository/job.go +++ b/repository/job.go @@ -11,6 +11,7 @@ import ( "github.com/ClusterCockpit/cc-backend/auth" "github.com/ClusterCockpit/cc-backend/graph/model" + "github.com/ClusterCockpit/cc-backend/log" "github.com/ClusterCockpit/cc-backend/schema" sq "github.com/Masterminds/squirrel" "github.com/iamlouk/lrucache" @@ -370,3 +371,26 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in return subclusters, nil } + +func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { + res, err := sq.Update("job"). + Set("monitoring_status", schema.MonitoringStatusArchivingFailed). + Set("duration", 0). + Set("job_state", schema.JobStateFailed). + Where("job.job_state = 'running'"). + Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)). + RunWith(r.DB).Exec() + if err != nil { + return err + } + + rowsAffected, err := res.RowsAffected() + if err != nil { + return err + } + + if rowsAffected > 0 { + log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected) + } + return nil +} diff --git a/server.go b/server.go index 28f210a..c9a5ae2 100644 --- a/server.go +++ b/server.go @@ -13,6 +13,7 @@ import ( "net/url" "os" "os/signal" + "runtime" "runtime/debug" "strings" "sync" @@ -92,6 +93,9 @@ type ProgramConfig struct { // Where to store MachineState files MachineStateDir string `json:"machine-state-dir"` + + // If not zero, automatically mark jobs as stopped running X seconds longer than theire walltime. + StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"` } var programConfig ProgramConfig = ProgramConfig{ @@ -123,6 +127,7 @@ var programConfig ProgramConfig = ProgramConfig{ "plot_view_showStatTable": true, "system_view_selectedMetric": "cpu_load", }, + StopJobsExceedingWalltime: 3600, } func main() { @@ -477,6 +482,18 @@ func main() { api.OngoingArchivings.Wait() }() + if programConfig.StopJobsExceedingWalltime != 0 { + go func() { + for range time.Tick(1 * time.Hour) { + err := jobRepo.StopJobsExceedingWalltimeBy(programConfig.StopJobsExceedingWalltime) + if err != nil { + log.Errorf("error while looking for jobs exceeding theire walltime: %s", err.Error()) + } + runtime.GC() + } + }() + } + if os.Getenv("GOGC") == "" { debug.SetGCPercent(25) }