mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-07-22 20:41:40 +02:00
Refactor taggers. Refine Job Hooks. Start job classifier
This commit is contained in:
@@ -1126,8 +1126,6 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
|
||||
return
|
||||
}
|
||||
|
||||
repository.CallJobStopHooks(job)
|
||||
|
||||
// Trigger async archiving
|
||||
archiver.TriggerArchiving(job)
|
||||
}
|
||||
|
@@ -72,7 +72,14 @@ func archivingWorker() {
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
|
||||
id := job.ID
|
||||
jobMeta.ID = &id
|
||||
|
||||
repository.CallJobStopHooks(jobMeta)
|
||||
archivePending.Done()
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
type JobHook interface {
|
||||
JobStartCallback(job *schema.Job)
|
||||
JobStopCallback(job *schema.Job)
|
||||
JobStopCallback(job *schema.JobMeta)
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -44,7 +44,7 @@ func CallJobStartHooks(jobs []*schema.Job) {
|
||||
}
|
||||
}
|
||||
|
||||
func CallJobStopHooks(job *schema.Job) {
|
||||
func CallJobStopHooks(job *schema.JobMeta) {
|
||||
if hooks == nil {
|
||||
return
|
||||
}
|
||||
|
121
internal/tagger/classifyJob.go
Normal file
121
internal/tagger/classifyJob.go
Normal file
@@ -0,0 +1,121 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package tagger
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"embed"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/expr-lang/expr"
|
||||
"github.com/expr-lang/expr/vm"
|
||||
)
|
||||
|
||||
//go:embed jobclasses/*
|
||||
var jobclassFiles embed.FS
|
||||
|
||||
type ruleInfo struct {
|
||||
tag string
|
||||
rule *vm.Program
|
||||
}
|
||||
|
||||
type JobClassTagger struct {
|
||||
rules map[string]ruleInfo
|
||||
tagType string
|
||||
cfgPath string
|
||||
}
|
||||
|
||||
func (t *JobClassTagger) compileRule(f fs.File, fns string) {
|
||||
buf := new(bytes.Buffer)
|
||||
_, err := buf.ReadFrom(f)
|
||||
if err != nil {
|
||||
log.Errorf("error reading rule file %s: %#v", fns, err)
|
||||
}
|
||||
prg, err := expr.Compile(buf.String(), expr.AsBool())
|
||||
if err != nil {
|
||||
log.Errorf("error compiling rule %s: %#v", fns, err)
|
||||
}
|
||||
ri := ruleInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), rule: prg}
|
||||
|
||||
delete(t.rules, ri.tag)
|
||||
t.rules[ri.tag] = ri
|
||||
}
|
||||
|
||||
func (t *JobClassTagger) EventMatch(s string) bool {
|
||||
return strings.Contains(s, "jobclasses")
|
||||
}
|
||||
|
||||
// FIXME: Only process the file that caused the event
|
||||
func (t *JobClassTagger) EventCallback() {
|
||||
files, err := os.ReadDir(t.cfgPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
log.Debugf("Process: %s", fns)
|
||||
f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns))
|
||||
if err != nil {
|
||||
log.Errorf("error opening app file %s: %#v", fns, err)
|
||||
}
|
||||
t.compileRule(f, fns)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *JobClassTagger) Register() error {
|
||||
t.cfgPath = "./var/tagger/jobclasses"
|
||||
t.tagType = "jobClass"
|
||||
|
||||
files, err := appFiles.ReadDir("jobclasses")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error reading app folder: %#v", err)
|
||||
}
|
||||
t.rules = make(map[string]ruleInfo, 0)
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
log.Debugf("Process: %s", fns)
|
||||
f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening app file %s: %#v", fns, err)
|
||||
}
|
||||
defer f.Close()
|
||||
t.compileRule(f, fns)
|
||||
}
|
||||
|
||||
if util.CheckFileExists(t.cfgPath) {
|
||||
t.EventCallback()
|
||||
log.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *JobClassTagger) Match(job *schema.JobMeta) {
|
||||
r := repository.GetJobRepository()
|
||||
|
||||
for _, ri := range t.rules {
|
||||
tag := ri.tag
|
||||
output, err := expr.Run(ri.rule, job)
|
||||
if err != nil {
|
||||
log.Errorf("error running rule %s: %#v", tag, err)
|
||||
}
|
||||
if output.(bool) {
|
||||
id := job.ID
|
||||
if !r.HasTag(*id, t.tagType, tag) {
|
||||
r.AddTagOrCreateDirect(*id, t.tagType, tag)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -19,11 +19,6 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
tagType = "app"
|
||||
appPath = "./var/tagger/apps"
|
||||
)
|
||||
|
||||
//go:embed apps/*
|
||||
var appFiles embed.FS
|
||||
|
||||
@@ -33,7 +28,9 @@ type appInfo struct {
|
||||
}
|
||||
|
||||
type AppTagger struct {
|
||||
apps map[string]appInfo
|
||||
apps map[string]appInfo
|
||||
tagType string
|
||||
cfgPath string
|
||||
}
|
||||
|
||||
func (t *AppTagger) scanApp(f fs.File, fns string) {
|
||||
@@ -53,7 +50,7 @@ func (t *AppTagger) EventMatch(s string) bool {
|
||||
|
||||
// FIXME: Only process the file that caused the event
|
||||
func (t *AppTagger) EventCallback() {
|
||||
files, err := os.ReadDir(appPath)
|
||||
files, err := os.ReadDir(t.cfgPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@@ -61,7 +58,7 @@ func (t *AppTagger) EventCallback() {
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
log.Debugf("Process: %s", fns)
|
||||
f, err := os.Open(fmt.Sprintf("%s/%s", appPath, fns))
|
||||
f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns))
|
||||
if err != nil {
|
||||
log.Errorf("error opening app file %s: %#v", fns, err)
|
||||
}
|
||||
@@ -70,6 +67,9 @@ func (t *AppTagger) EventCallback() {
|
||||
}
|
||||
|
||||
func (t *AppTagger) Register() error {
|
||||
t.cfgPath = "./var/tagger/apps"
|
||||
t.tagType = "app"
|
||||
|
||||
files, err := appFiles.ReadDir("apps")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error reading app folder: %#v", err)
|
||||
@@ -79,28 +79,25 @@ func (t *AppTagger) Register() error {
|
||||
fns := fn.Name()
|
||||
log.Debugf("Process: %s", fns)
|
||||
f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns))
|
||||
defer f.Close()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening app file %s: %#v", fns, err)
|
||||
}
|
||||
t.scanApp(f, fns)
|
||||
}
|
||||
|
||||
if util.CheckFileExists(appPath) {
|
||||
if util.CheckFileExists(t.cfgPath) {
|
||||
t.EventCallback()
|
||||
log.Infof("Setup file watch for %s", appPath)
|
||||
util.AddListener(appPath, t)
|
||||
log.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *AppTagger) Match(job *schema.Job) {
|
||||
func (t *AppTagger) Match(job *schema.JobMeta) {
|
||||
r := repository.GetJobRepository()
|
||||
meta, err := r.FetchMetadata(job)
|
||||
if err != nil {
|
||||
log.Error("cannot fetch meta data")
|
||||
}
|
||||
jobscript, ok := meta["jobScript"]
|
||||
jobscript, ok := job.MetaData["jobScript"]
|
||||
if ok {
|
||||
id := job.ID
|
||||
|
||||
@@ -109,8 +106,8 @@ func (t *AppTagger) Match(job *schema.Job) {
|
||||
tag := a.tag
|
||||
for _, s := range a.strings {
|
||||
if strings.Contains(jobscript, s) {
|
||||
if !r.HasTag(id, tagType, tag) {
|
||||
r.AddTagOrCreateDirect(id, tagType, tag)
|
||||
if !r.HasTag(*id, t.tagType, tag) {
|
||||
r.AddTagOrCreateDirect(*id, t.tagType, tag)
|
||||
break out
|
||||
}
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
func setup(tb testing.TB) *repository.JobRepository {
|
||||
@@ -51,7 +52,12 @@ func TestMatch(t *testing.T) {
|
||||
err = tagger.Register()
|
||||
noErr(t, err)
|
||||
|
||||
tagger.Match(job)
|
||||
jobMeta := &schema.JobMeta{
|
||||
ID: &job.ID,
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
}
|
||||
tagger.Match(jobMeta)
|
||||
|
||||
if !r.HasTag(5, "app", "vasp") {
|
||||
t.Errorf("missing tag vasp")
|
||||
|
38
internal/tagger/jobclasses/highload.json
Normal file
38
internal/tagger/jobclasses/highload.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "Excessive CPU load",
|
||||
"tag": "excessiveload",
|
||||
"comment": "Assumptions: all nodes have the same number of cores.",
|
||||
"parameters": [
|
||||
"excessivecpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"cpu_load"
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"highload_nodes": "load_mean > load_threshold"
|
||||
},
|
||||
{
|
||||
"highload": "highload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "load_mean / load_threshold"
|
||||
}
|
||||
],
|
||||
"output": "highload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
40
internal/tagger/jobclasses/highmem.json
Normal file
40
internal/tagger/jobclasses/highmem.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "High memory usage",
|
||||
"tag": "high_memory_load",
|
||||
"parameters": [
|
||||
"high_memory_load_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"mem_used"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds",
|
||||
"hasattr(job, \"allocated_memory\")"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"memory_alloc": "job.allocated_memory"
|
||||
},
|
||||
{
|
||||
"memory_used": "mem_used.max('time')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "memory_alloc * high_memory_load_threshold_factor"
|
||||
},
|
||||
{
|
||||
"high_mem_nodes": "memory_used > load_threshold"
|
||||
},
|
||||
{
|
||||
"high_mem": "high_mem_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)"
|
||||
}
|
||||
],
|
||||
"output": "high_mem",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
36
internal/tagger/jobclasses/lowgpuload.json
Normal file
36
internal/tagger/jobclasses/lowgpuload.json
Normal file
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"name": "Low GPU load",
|
||||
"tag": "lowgpuload",
|
||||
"parameters": [
|
||||
"lowgpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"nv_util"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"load_mean": "nv_util.mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "job.numAcc * lowgpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
}
|
||||
],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
38
internal/tagger/jobclasses/lowload.json
Normal file
38
internal/tagger/jobclasses/lowload.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "Low CPU load",
|
||||
"tag": "lowload",
|
||||
"parameters": [
|
||||
"lowcpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"cpu_load"
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"tagRule": [
|
||||
{
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "job.numHwthreads * lowcpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
}
|
||||
],
|
||||
"valueRule": [],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
@@ -13,7 +13,7 @@ import (
|
||||
|
||||
type Tagger interface {
|
||||
Register() error
|
||||
Match(job *schema.Job)
|
||||
Match(job *schema.JobMeta)
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -31,6 +31,8 @@ func Init() {
|
||||
jobTagger = &JobTagger{}
|
||||
jobTagger.startTaggers = make([]Tagger, 0)
|
||||
jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{})
|
||||
jobTagger.stopTaggers = make([]Tagger, 0)
|
||||
jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{})
|
||||
|
||||
for _, tagger := range jobTagger.startTaggers {
|
||||
tagger.Register()
|
||||
@@ -46,5 +48,5 @@ func (jt *JobTagger) JobStartCallback(job *schema.Job) {
|
||||
}
|
||||
}
|
||||
|
||||
func (jt *JobTagger) JobStopCallback(job *schema.Job) {
|
||||
func (jt *JobTagger) JobStopCallback(job *schema.JobMeta) {
|
||||
}
|
||||
|
Reference in New Issue
Block a user