Automatically mark jobs as failed if running too long

This commit is contained in:
Lou Knauer 2022-04-07 09:50:32 +02:00
parent b47d175ab2
commit cbc8227550
2 changed files with 41 additions and 0 deletions

View File

@ -11,6 +11,7 @@ import (
"github.com/ClusterCockpit/cc-backend/auth" "github.com/ClusterCockpit/cc-backend/auth"
"github.com/ClusterCockpit/cc-backend/graph/model" "github.com/ClusterCockpit/cc-backend/graph/model"
"github.com/ClusterCockpit/cc-backend/log"
"github.com/ClusterCockpit/cc-backend/schema" "github.com/ClusterCockpit/cc-backend/schema"
sq "github.com/Masterminds/squirrel" sq "github.com/Masterminds/squirrel"
"github.com/iamlouk/lrucache" "github.com/iamlouk/lrucache"
@ -370,3 +371,26 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
return subclusters, nil return subclusters, nil
} }
// StopJobsExceedingWalltimeBy marks every job that is still in state
// 'running' although it was started more than (walltime + seconds) ago as
// failed. For each such row job_state is set to schema.JobStateFailed,
// monitoring_status to schema.MonitoringStatusArchivingFailed and duration
// to 0. If at least one row was changed, a warning with the count is logged.
//
// seconds is the grace period, in seconds, that is added to a job's walltime
// before the job is considered overdue.
//
// An error is returned if the UPDATE cannot be executed or if the number of
// affected rows cannot be determined.
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
	// job.start_time is compared against a Unix timestamp in seconds.
	now := time.Now().Unix()

	res, err := sq.Update("job").
		Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
		Set("duration", 0).
		Set("job_state", schema.JobStateFailed).
		Where("job.job_state = 'running'").
		// NOTE(review): assumes walltime == 0 means that no limit was recorded
		// for the job - confirm against the job schema. Without this guard
		// such jobs are marked as failed as soon as they ran longer than
		// `seconds`.
		Where("job.walltime > 0").
		// Bind the values as query arguments instead of formatting them into
		// the SQL text.
		Where("(? - job.start_time) > (job.walltime + ?)", now, seconds).
		RunWith(r.DB).Exec()
	if err != nil {
		return fmt.Errorf("marking jobs exceeding their walltime as failed: %w", err)
	}

	rowsAffected, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("counting jobs marked as failed: %w", err)
	}

	if rowsAffected > 0 {
		log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)
	}
	return nil
}

View File

@ -13,6 +13,7 @@ import (
"net/url" "net/url"
"os" "os"
"os/signal" "os/signal"
"runtime"
"runtime/debug" "runtime/debug"
"strings" "strings"
"sync" "sync"
@ -92,6 +93,9 @@ type ProgramConfig struct {
// Where to store MachineState files // Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"` MachineStateDir string `json:"machine-state-dir"`
// If not zero, automatically stop (mark as failed) jobs that have been running X seconds longer than their walltime.
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
} }
var programConfig ProgramConfig = ProgramConfig{ var programConfig ProgramConfig = ProgramConfig{
@ -123,6 +127,7 @@ var programConfig ProgramConfig = ProgramConfig{
"plot_view_showStatTable": true, "plot_view_showStatTable": true,
"system_view_selectedMetric": "cpu_load", "system_view_selectedMetric": "cpu_load",
}, },
StopJobsExceedingWalltime: 3600,
} }
func main() { func main() {
@ -477,6 +482,18 @@ func main() {
api.OngoingArchivings.Wait() api.OngoingArchivings.Wait()
}() }()
// Periodically mark running jobs that exceeded their walltime as failed.
// The check is only enabled when StopJobsExceedingWalltime is non-zero.
if programConfig.StopJobsExceedingWalltime != 0 {
go func() {
// time.Tick delivers its first tick only after one full interval, so the
// first check runs one hour after this goroutine has been started. The
// loop has no exit, so the goroutine lives as long as the process.
for range time.Tick(1 * time.Hour) {
// The configured value is passed on as the grace period in seconds.
err := jobRepo.StopJobsExceedingWalltimeBy(programConfig.StopJobsExceedingWalltime)
if err != nil {
// A failed check is only logged; the next tick simply tries again.
log.Errorf("error while looking for jobs exceeding theire walltime: %s", err.Error())
}
// Force a garbage collection after every check.
// NOTE(review): the reason for the explicit GC is not visible here.
runtime.GC()
}
}()
}
if os.Getenv("GOGC") == "" { if os.Getenv("GOGC") == "" {
debug.SetGCPercent(25) debug.SetGCPercent(25)
} }