mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 05:19:05 +01:00
Automatically mark jobs as failed if running too long
This commit is contained in:
parent
b47d175ab2
commit
cbc8227550
@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/auth"
|
"github.com/ClusterCockpit/cc-backend/auth"
|
||||||
"github.com/ClusterCockpit/cc-backend/graph/model"
|
"github.com/ClusterCockpit/cc-backend/graph/model"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/log"
|
||||||
"github.com/ClusterCockpit/cc-backend/schema"
|
"github.com/ClusterCockpit/cc-backend/schema"
|
||||||
sq "github.com/Masterminds/squirrel"
|
sq "github.com/Masterminds/squirrel"
|
||||||
"github.com/iamlouk/lrucache"
|
"github.com/iamlouk/lrucache"
|
||||||
@ -370,3 +371,26 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
|||||||
|
|
||||||
return subclusters, nil
|
return subclusters, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||||
|
res, err := sq.Update("job").
|
||||||
|
Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
|
||||||
|
Set("duration", 0).
|
||||||
|
Set("job_state", schema.JobStateFailed).
|
||||||
|
Where("job.job_state = 'running'").
|
||||||
|
Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
|
||||||
|
RunWith(r.DB).Exec()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
rowsAffected, err := res.RowsAffected()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if rowsAffected > 0 {
|
||||||
|
log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
17
server.go
17
server.go
@ -13,6 +13,7 @@ import (
|
|||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"runtime"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@ -92,6 +93,9 @@ type ProgramConfig struct {
|
|||||||
|
|
||||||
// Where to store MachineState files
|
// Where to store MachineState files
|
||||||
MachineStateDir string `json:"machine-state-dir"`
|
MachineStateDir string `json:"machine-state-dir"`
|
||||||
|
|
||||||
|
// If not zero, automatically mark jobs as stopped running X seconds longer than theire walltime.
|
||||||
|
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var programConfig ProgramConfig = ProgramConfig{
|
var programConfig ProgramConfig = ProgramConfig{
|
||||||
@ -123,6 +127,7 @@ var programConfig ProgramConfig = ProgramConfig{
|
|||||||
"plot_view_showStatTable": true,
|
"plot_view_showStatTable": true,
|
||||||
"system_view_selectedMetric": "cpu_load",
|
"system_view_selectedMetric": "cpu_load",
|
||||||
},
|
},
|
||||||
|
StopJobsExceedingWalltime: 3600,
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@ -477,6 +482,18 @@ func main() {
|
|||||||
api.OngoingArchivings.Wait()
|
api.OngoingArchivings.Wait()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if programConfig.StopJobsExceedingWalltime != 0 {
|
||||||
|
go func() {
|
||||||
|
for range time.Tick(1 * time.Hour) {
|
||||||
|
err := jobRepo.StopJobsExceedingWalltimeBy(programConfig.StopJobsExceedingWalltime)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("error while looking for jobs exceeding theire walltime: %s", err.Error())
|
||||||
|
}
|
||||||
|
runtime.GC()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
if os.Getenv("GOGC") == "" {
|
if os.Getenv("GOGC") == "" {
|
||||||
debug.SetGCPercent(25)
|
debug.SetGCPercent(25)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user