From d839c536423ee651f52fa03f76695120ec159296 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 22 Aug 2023 10:56:32 +0200 Subject: [PATCH 01/30] Add initial structure --- internal/tagger/apps/gromacs.txt | 0 internal/tagger/apps/openfoam.txt | 0 internal/tagger/apps/vasp.txt | 0 internal/tagger/detectApp.go | 20 ++++++++++++++++++++ internal/tagger/tagger.go | 17 +++++++++++++++++ 5 files changed, 37 insertions(+) create mode 100644 internal/tagger/apps/gromacs.txt create mode 100644 internal/tagger/apps/openfoam.txt create mode 100644 internal/tagger/apps/vasp.txt create mode 100644 internal/tagger/detectApp.go create mode 100644 internal/tagger/tagger.go diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/apps/openfoam.txt b/internal/tagger/apps/openfoam.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go new file mode 100644 index 0000000..298151b --- /dev/null +++ b/internal/tagger/detectApp.go @@ -0,0 +1,20 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +const tagType = "app" + +type appInfo struct { + tag string + strings []string +} +type AppTagger struct { + apps []appInfo +} + +func (t *AppTagger) Register() error { + + return nil +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go new file mode 100644 index 0000000..52a369b --- /dev/null +++ b/internal/tagger/tagger.go @@ -0,0 +1,17 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package tagger + +import "github.com/ClusterCockpit/cc-backend/pkg/schema" + +type Tagger interface { + Register() error + Match(job *schema.Job) +} + +func Init() error { + + return nil +} From dc0d9fe038a0a940d1b00ceb24f259721e3a293c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 27 Sep 2023 15:01:08 +0200 Subject: [PATCH 02/30] Add more tags to test db --- internal/repository/testdata/job.db | Bin 114688 -> 114688 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 4b00aa55f041b70f717177bc7baef9eb69d1a226..4685f7f85704574eab5cbd27a08f41eb71bf719d 100644 GIT binary patch delta 240 zcmZo@U~gz(-@xL)!Nk9ff&V7|w#|Y9tN4`_nOGUbMU4v*OEU5^Q*u&`42+C*4J>ty z%oPlctV}Je49zFs^w$RJy3fFWpZ^_D*JXZwJ{ECNMz|iP$?xLDxS07jGVnj;Kh3`p zsAe&LtOPR)Bd0M7vVq1{1_oA!h7wF58CDD#15-&x7Dh&6RwiUApern`Of0z=7#NuO oUo-H(=KsY16KKUjt@f&b=aL4ou9lW+QK1NrY5_}>BfPxvRlix=Z!5MW^7{{n>X y!D2G3oD2*MJe+Jml8uvv3rK-geC7v2po&NQ0&I-TKtWzcHYSLgriNt=j1B+{bsGKv From ba7cc9168e74a925c06ac108355b221d5fdf0e88 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 28 Sep 2023 10:20:20 +0200 Subject: [PATCH 03/30] feat: add automatic application detection and tagging --- internal/repository/job_test.go | 30 +++++++++------ internal/repository/tags.go | 13 +++++++ internal/tagger/apps/gromacs.txt | 3 ++ internal/tagger/apps/openfoam.txt | 1 + internal/tagger/apps/vasp.txt | 2 + internal/tagger/detectApp.go | 64 +++++++++++++++++++++++++++++++ internal/tagger/detectApp_test.go | 59 ++++++++++++++++++++++++++++ 7 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 internal/tagger/detectApp_test.go diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index c3f76a7..986365c 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -16,9 +16,7 @@ func TestFind(t *testing.T) { jobId, cluster, startTime := int64(398998), "fritz", int64(1675957496) job, err := r.Find(&jobId, &cluster, &startTime) - if err != nil { - t.Fatal(err) - } + noErr(t, err) // fmt.Printf("%+v", job) @@ -31,9 +29,7 @@ func TestFindById(t *testing.T) { r := setup(t) job, err := r.FindById(5) - if err != nil { - t.Fatal(err) - } + noErr(t, err) // fmt.Printf("%+v", job) @@ -46,14 +42,26 @@ func TestGetTags(t *testing.T) { r := setup(t) tags, counts, err := r.CountTags(nil) - if err != nil { - t.Fatal(err) - } + noErr(t, err) fmt.Printf("TAGS %+v \n", tags) // fmt.Printf("COUNTS %+v \n", counts) - if counts["bandwidth"] != 0 { - t.Errorf("wrong tag count \ngot: %d \nwant: 0", counts["bandwidth"]) + if counts["bandwidth"] != 2 { + t.Errorf("wrong tag count \ngot: %d \nwant: 2", counts["bandwidth"]) + } +} + +func TestHasTag(t *testing.T) { + r := setup(t) + + if !r.HasTag(5, "util", "bandwidth") { + t.Errorf("Expected has tag") + } + if r.HasTag(4, "patho", "idle") { + t.Errorf("Expected has not tag") + } + if !r.HasTag(5, "patho", "idle") { + t.Errorf("Expected has tag") } } diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 52bc836..a6a41b6 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -134,6 +134,19 @@ func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName stri return tagId, nil } +func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { + var id int64 + q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). + Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). 
+ Where("tag.tag_name = ?", tagName) + err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) + if err != nil { + return false + } else { + return true + } +} + // TagId returns the database id of the tag with the specified type and name. func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) { exists = true diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt index e69de29..d8c0829 100644 --- a/internal/tagger/apps/gromacs.txt +++ b/internal/tagger/apps/gromacs.txt @@ -0,0 +1,3 @@ +GROMACS +gromacs +GMX diff --git a/internal/tagger/apps/openfoam.txt b/internal/tagger/apps/openfoam.txt index e69de29..542d645 100644 --- a/internal/tagger/apps/openfoam.txt +++ b/internal/tagger/apps/openfoam.txt @@ -0,0 +1 @@ +openfoam diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt index e69de29..eec9092 100644 --- a/internal/tagger/apps/vasp.txt +++ b/internal/tagger/apps/vasp.txt @@ -0,0 +1,2 @@ +VASP +vasp diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 298151b..714fd27 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -4,17 +4,81 @@ // license that can be found in the LICENSE file. package tagger +import ( + "bufio" + "embed" + "fmt" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + const tagType = "app" +//go:embed apps/* +var appFiles embed.FS + type appInfo struct { tag string strings []string } + type AppTagger struct { apps []appInfo } func (t *AppTagger) Register() error { + files, err := appFiles.ReadDir("apps") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.apps = make([]appInfo, 0) + + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + scanner := bufio.NewScanner(f) + ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + + for scanner.Scan() { + ai.strings = append(ai.strings, scanner.Text()) + } + t.apps = append(t.apps, ai) + } return nil } + +func (t *AppTagger) Match(job *schema.Job) { + r := repository.GetJobRepository() + meta, err := r.FetchMetadata(job) + if err != nil { + log.Error("cannot fetch meta data") + } + jobscript, ok := meta["jobScript"] + if ok { + id := job.ID + + out: + for _, a := range t.apps { + tag := a.tag + for _, s := range a.strings { + if strings.Contains(jobscript, s) { + if !r.HasTag(id, tagType, tag) { + r.AddTagOrCreate(id, tagType, tag) + break out + } + } + } + } + } else { + log.Infof("Cannot extract job script for job: %d on %s", job.JobID, job.Cluster) + } +} diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go new file mode 100644 index 0000000..54a8dfd --- /dev/null +++ b/internal/tagger/detectApp_test.go @@ -0,0 +1,59 @@ +// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" +) + +func setup(tb testing.TB) *repository.JobRepository { + tb.Helper() + log.Init("warn", true) + dbfile := "../repository/testdata/job.db" + err := repository.MigrateDB("sqlite3", dbfile) + noErr(tb, err) + repository.Connect("sqlite3", dbfile) + return repository.GetJobRepository() +} + +func noErr(tb testing.TB, err error) { + tb.Helper() + + if err != nil { + tb.Fatal("Error is not nil:", err) + } +} + +func TestRegister(t *testing.T) { + var tagger AppTagger + + err := tagger.Register() + noErr(t, err) + + if len(tagger.apps) != 3 { + t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 3", len(tagger.apps)) + } +} + +func TestMatch(t *testing.T) { + r := setup(t) + + job, err := r.FindById(5) + noErr(t, err) + + var tagger AppTagger + + err = tagger.Register() + noErr(t, err) + + tagger.Match(job) + + if !r.HasTag(5, "app", "vasp") { + t.Errorf("missing tag vasp") + } +} From 2502989ca2c33d654f923687db8c53c5b44c8b5b Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 28 Sep 2023 10:20:35 +0200 Subject: [PATCH 04/30] Refactor --- pkg/schema/validate.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/schema/validate.go b/pkg/schema/validate.go index aec234c..77b6dbb 100644 --- a/pkg/schema/validate.go +++ b/pkg/schema/validate.go @@ -28,12 +28,13 @@ const ( //go:embed schemas/* var schemaFiles embed.FS -func Validate(k Kind, r io.Reader) (err error) { +func Validate(k Kind, r io.Reader) error { jsonschema.Loaders["embedfs"] = func(s string) (io.ReadCloser, error) { f := filepath.Join("schemas", strings.Split(s, "//")[1]) return schemaFiles.Open(f) } var s *jsonschema.Schema + var err error switch k { case Meta: @@ -54,7 +55,7 @@ func Validate(k Kind, r io.Reader) (err error) { } var v interface{} - if err := json.NewDecoder(r).Decode(&v); err != nil { + if err = json.NewDecoder(r).Decode(&v); err != nil { log.Warnf("Error while decoding raw json schema: %#v", err) return err } From efbe53b6b45a79fe432814189a5e99b2a9dbbaa4 Mon Sep 17 00:00:00 2001 From: AmritanshuV <88365075+AmritanshuV@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:40:57 +0200 Subject: [PATCH 05/30] Rules --- internal/tagger/rules.json | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 internal/tagger/rules.json diff --git a/internal/tagger/rules.json b/internal/tagger/rules.json new file mode 100644 index 0000000..c88afb4 --- /dev/null +++ b/internal/tagger/rules.json @@ -0,0 +1,21 @@ +{ + "and": [ + { + "in": [ + "a40", + { + "var": "metaData.jobScript" + } + ] + }, + { + ">": [ + { + "var": "statistics.clock.min" + }, + 2000 + ] + } + ] + } + \ No newline at end of file From 7abdd0545e5a2ad7e1906411f38c1397185e8ef9 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 07:24:24 +0200 Subject: [PATCH 06/30] Add api for tag handling within cc-backend --- internal/graph/schema.resolvers.go | 37 +++++++------- internal/repository/tags.go | 78 +++++++++++++++++++++++++++++- internal/tagger/detectApp.go | 6 +-- internal/tagger/detectApp_test.go | 2 +- 4 files changed, 98 insertions(+), 25 deletions(-) diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index f3fc389..7e52b3d 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -143,7 +143,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, 
name s return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil } else { log.Warnf("Not authorized to create tag with scope: %s", scope) - return nil, fmt.Errorf("Not authorized to create tag with scope: %s", scope) + return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope) } } @@ -179,7 +179,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -193,7 +193,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds } } else { log.Warnf("Not authorized to add tag: %d", tid) - return nil, fmt.Errorf("Not authorized to add tag: %d", tid) + return nil, fmt.Errorf("not authorized to add tag: %d", tid) } } @@ -226,7 +226,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -240,7 +240,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } @@ -269,7 +269,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Everyone && Private Tag @@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } return tags, nil @@ -499,10 +499,7 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag return nil, err } - hasNextPage := false - if len(nextJobs) == 1 { - hasNextPage = true - } + hasNextPage := len(nextJobs) == 1 return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil } @@ -513,8 +510,8 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF var stats []*model.JobsStatistics // Top Level Defaults - var defaultDurationBins string = "1h" - var defaultMetricBins int = 10 + defaultDurationBins := "1h" + defaultMetricBins := 10 if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") { @@ -779,9 +776,11 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } // SubCluster returns generated.SubClusterResolver implementation. 
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} } -type clusterResolver struct{ *Resolver } -type jobResolver struct{ *Resolver } -type metricValueResolver struct{ *Resolver } -type mutationResolver struct{ *Resolver } -type queryResolver struct{ *Resolver } -type subClusterResolver struct{ *Resolver } +type ( + clusterResolver struct{ *Resolver } + jobResolver struct{ *Resolver } + metricValueResolver struct{ *Resolver } + mutationResolver struct{ *Resolver } + queryResolver struct{ *Resolver } + subClusterResolver struct{ *Resolver } +) diff --git a/internal/repository/tags.go b/internal/repository/tags.go index d07c4d2..a9416c4 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -45,6 +45,36 @@ func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*sche return tags, archive.UpdateTags(j, archiveTags) } +func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error) { + j, err := r.FindByIdDirect(job) + if err != nil { + log.Warn("Error while finding job by id") + return nil, err + } + + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag) + + if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error adding tag with %s: %v", s, err) + return nil, err + } + + tags, err := r.GetTagsDirect(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + archiveTags, err := r.getArchiveTags(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + return tags, archive.UpdateTags(j, archiveTags) +} + // Removes a tag from a job by tag id func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIdWithUser(user, job) @@ -82,7 +112,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return nil, fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Get Job @@ -122,7 +152,7 @@ func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagSc tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Handle Delete JobTagTable @@ -291,6 +321,24 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return tagId, nil } +func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { + tagScope := "global" + + tagId, exists := r.TagId(tagType, tagName, tagScope) + if !exists { + tagId, err = r.CreateTag(tagType, tagName, tagScope) + if err != nil { + return 0, err + } + } + + if _, err := r.AddTagDirect(jobId, tagId); err != nil { + return 0, err + } + + return tagId, nil +} + func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { var id int64 q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). 
@@ -359,6 +407,32 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e return tags, nil } +func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { + q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") + if job != nil { + q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job) + } + + rows, err := q.RunWith(r.stmtCache).Query() + if err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error get tags with %s: %v", s, err) + return nil, err + } + + tags := make([]*schema.Tag, 0) + for rows.Next() { + tag := &schema.Tag{} + if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + tags = append(tags, tag) + } + + return tags, nil +} + // GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 714fd27..339e398 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -1,5 +1,5 @@ -// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. package tagger @@ -72,7 +72,7 @@ func (t *AppTagger) Match(job *schema.Job) { for _, s := range a.strings { if strings.Contains(jobscript, s) { if !r.HasTag(id, tagType, tag) { - r.AddTagOrCreate(id, tagType, tag) + r.AddTagOrCreateDirect(id, tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 54a8dfd..8978e35 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -43,7 +43,7 @@ func TestRegister(t *testing.T) { func TestMatch(t *testing.T) { r := setup(t) - job, err := r.FindById(5) + job, err := r.FindByIdDirect(5) noErr(t, err) var tagger AppTagger From fe1ff5c7a3da53d0fe161814d34638a1ea35a16c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 07:33:33 +0200 Subject: [PATCH 07/30] Update tests from dev --- internal/repository/job_test.go | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 70d8053..363bb6c 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -18,7 +18,9 @@ func TestFind(t *testing.T) { jobId, cluster, startTime := int64(398998), "fritz", int64(1675957496) job, err := r.Find(&jobId, &cluster, &startTime) - noErr(t, err) + if err != nil { + t.Fatal(err) + } // fmt.Printf("%+v", job) @@ -65,21 +67,7 @@ func TestGetTags(t *testing.T) { fmt.Printf("TAGS %+v \n", tags) // fmt.Printf("COUNTS %+v \n", counts) - if counts["bandwidth"] != 2 { - t.Errorf("wrong tag count \ngot: %d \nwant: 2", counts["bandwidth"]) - } -} - -func TestHasTag(t *testing.T) { - r := setup(t) - - if !r.HasTag(5, "util", "bandwidth") { - t.Errorf("Expected has tag") - } - if r.HasTag(4, "patho", "idle") { - t.Errorf("Expected has not tag") - } - if !r.HasTag(5, "patho", "idle") { - t.Errorf("Expected has tag") + if counts["bandwidth"] != 0 { + t.Errorf("wrong tag count \ngot: 
%d \nwant: 0", counts["bandwidth"]) } } From 432e06e801f0f90cb9dab609e5b92d434fc58390 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:19:56 +0200 Subject: [PATCH 08/30] Add GoString method for jobmeta --- pkg/schema/job.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 5e3110b..df901b4 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -145,7 +145,12 @@ const ( JobStateOutOfMemory JobState = "out_of_memory" ) -func (e *JobState) UnmarshalGQL(v interface{}) error { +func (j JobMeta) GoString() string { + return fmt.Sprintf("JobMeta{ID:%d, StartTime:%d, JobID:%v, BaseJob:%v}", + j.ID, j.StartTime, j.JobID, j.BaseJob) +} + +func (e *JobState) UnmarshalGQL(v any) error { str, ok := v.(string) if !ok { return fmt.Errorf("SCHEMA/JOB > enums must be strings") From eab7961a83ef1604fc496d75d6cae95249dc815a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:32:19 +0200 Subject: [PATCH 09/30] Introduce caching table for faster job inserts Fixes #392 --- internal/repository/job.go | 22 ++------- internal/repository/jobCreate.go | 47 ++++++++++++++++++- internal/repository/jobFind.go | 20 ++++++++ internal/repository/migration.go | 13 ++++- .../sqlite3/09_add-job-cache.down.sql | 1 + .../sqlite3/09_add-job-cache.up.sql | 31 ++++++++++++ 6 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 internal/repository/migrations/sqlite3/09_add-job-cache.down.sql create mode 100644 internal/repository/migrations/sqlite3/09_add-job-cache.up.sql diff --git a/internal/repository/job.go b/internal/repository/job.go index 84de6f7..54a436a 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -9,12 +9,12 @@ import ( "encoding/json" "errors" "fmt" + "maps" "math" "strconv" "sync" "time" - "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/lrucache" @@ -33,6 +33,7 @@ type JobRepository struct { stmtCache *sq.StmtCache cache *lrucache.Cache driver string + Mutex sync.Mutex } func GetJobRepository() *JobRepository { @@ -56,7 +57,7 @@ var jobColumns []string = []string{ "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", } -func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { +func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} if err := row.Scan( @@ -138,17 +139,6 @@ func (r *JobRepository) Flush() error { return nil } -func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) { - jobLink := &model.JobLink{} - if err := row.Scan( - &jobLink.ID, &jobLink.JobID); err != nil { - log.Warn("Error while scanning rows (jobLink)") - return nil, err - } - - return jobLink, nil -} - func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { start := time.Now() cachekey := fmt.Sprintf("metadata:%d", job.ID) @@ -189,9 +179,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er if job.MetaData != nil { cpy := make(map[string]string, len(job.MetaData)+1) - for k, v := range job.MetaData { - cpy[k] = v - } + maps.Copy(cpy, job.MetaData) cpy[key] = val job.MetaData = cpy } else { @@ -389,7 +377,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table func (r *JobRepository) Partitions(cluster string) ([]string, error) { var err 
error start := time.Now() - partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) { + partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) { parts := []string{} if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { return nil, 0, 1000 diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 9e47974..3b997f3 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -13,6 +13,14 @@ import ( sq "github.com/Masterminds/squirrel" ) +const NamedJobCacheInsert string = `INSERT INTO job_cache ( + job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data +) VALUES ( + :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data +);` + const NamedJobInsert string = `INSERT INTO job ( job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data @@ -22,7 +30,9 @@ const NamedJobInsert string = `INSERT INTO job ( );` func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { - res, err := r.DB.NamedExec(NamedJobInsert, job) + r.Mutex.Lock() + res, err := r.DB.NamedExec(NamedJobCacheInsert, job) + r.Mutex.Unlock() if err != nil { log.Warn("Error while NamedJobInsert") return 0, err @@ -36,6 +46,25 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { return id, nil } +func (r *JobRepository) SyncJobs() error { + r.Mutex.Lock() + defer r.Mutex.Unlock() + _, err := r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + if err != nil { + log.Warnf("Error while Job sync: %v", err) + return err + } + + _, err = r.DB.Exec("DELETE FROM job_cache") + if err != nil { + log.Warn("Error while Job cache clean") + return err + } + + return nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { @@ -73,3 +102,19 @@ func (r *JobRepository) Stop( _, err = stmt.RunWith(r.stmtCache).Exec() return } + +func (r *JobRepository) StopCached( + jobId int64, + duration int32, + state schema.JobState, + monitoringStatus int32, +) (err error) { + stmt := sq.Update("job_cache"). + Set("job_state", state). + Set("duration", duration). + Set("monitoring_status", monitoringStatus). 
+ Where("job.id = ?", jobId) + + _, err = stmt.RunWith(r.stmtCache).Exec() + return +} diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 1e2ccb8..ac09355 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -43,6 +43,26 @@ func (r *JobRepository) Find( return scanJob(q.RunWith(r.stmtCache).QueryRow()) } +func (r *JobRepository) FindCached( + jobId *int64, + cluster *string, + startTime *int64, +) (*schema.Job, error) { + q := sq.Select(jobColumns...).From("job_cache"). + Where("job_cache.job_id = ?", *jobId) + + if cluster != nil { + q = q.Where("job_cache.cluster = ?", *cluster) + } + if startTime != nil { + q = q.Where("job_cache.start_time = ?", *startTime) + } + + q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match + + return scanJob(q.RunWith(r.stmtCache).QueryRow()) +} + // Find executes a SQL query to find a specific batch job. // The job is queried using the batch job id, the cluster name, // and the start time of the job in UNIX epoch time seconds. diff --git a/internal/repository/migration.go b/internal/repository/migration.go index 0b2591e..c0693da 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,7 +16,7 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) -const Version uint = 8 +const Version uint = 9 //go:embed migrations/* var migrationFiles embed.FS @@ -115,8 +115,17 @@ func MigrateDB(backend string, db string) error { } v, dirty, err := m.Version() + if err != nil { + if err == migrate.ErrNilVersion { + log.Warn("Legacy database without version or missing database file!") + } else { + return err + } + } - log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + if v < Version { + log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + } if dirty { return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version) diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql new file mode 100644 index 0000000..ef257cf --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS job_cache; diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql new file mode 100644 index 0000000..7840369 --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -0,0 +1,31 @@ +CREATE TABLE "job_cache" ( + id INTEGER PRIMARY KEY, + job_id BIGINT NOT NULL, + cluster VARCHAR(255) NOT NULL, + subcluster VARCHAR(255) NOT NULL, + start_time BIGINT NOT NULL, -- Unix timestamp + hpc_user VARCHAR(255) NOT NULL, + project VARCHAR(255) NOT NULL, + cluster_partition VARCHAR(255), + array_job_id BIGINT, + duration INT NOT NULL, + walltime INT NOT NULL, + job_state VARCHAR(255) NOT NULL + CHECK (job_state IN ( + 'running', 'completed', 'failed', 'cancelled', + 'stopped', 'timeout', 'preempted', 'out_of_memory' + )), + meta_data TEXT, -- JSON + resources TEXT NOT NULL, -- JSON + num_nodes INT NOT NULL, + num_hwthreads INT, + num_acc INT, + smt TINYINT NOT NULL DEFAULT 1 CHECK (smt IN (0, 1)), + exclusive TINYINT NOT NULL DEFAULT 1 CHECK (exclusive IN (0, 1, 2)), + 
monitoring_status TINYINT NOT NULL DEFAULT 1 + CHECK (monitoring_status IN (0, 1, 2, 3)), + energy REAL NOT NULL DEFAULT 0.0, + energy_footprint TEXT DEFAULT NULL, + footprint TEXT DEFAULT NULL, + UNIQUE (job_id, cluster, start_time) +); From 40110580e080fbd1188ba7ef3e2154f2f97ef768 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:33:44 +0200 Subject: [PATCH 10/30] feat: add job hook support Fixes #394 --- internal/repository/jobHooks.go | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 internal/repository/jobHooks.go diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go new file mode 100644 index 0000000..d69874f --- /dev/null +++ b/internal/repository/jobHooks.go @@ -0,0 +1,34 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +type JobHook interface { + jobStartCallback() + jobStopCallback() +} + +var hooks []JobHook + +func RegisterJobJook(hook JobHook) { + if hook != nil { + hooks = append(hooks, hook) + } +} + +func CallJobStartHooks() { + for _, hook := range hooks { + if hook != nil { + hook.jobStartCallback() + } + } +} + +func CallJobStopHooks() { + for _, hook := range hooks { + if hook != nil { + hook.jobStopCallback() + } + } +} From d76b1ae75dc1a98395dfb05b722ce17a1f623800 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:36:33 +0200 Subject: [PATCH 11/30] feat: add job commit service Sync jobs from job cache table to main job table. Enables #392 --- internal/taskManager/commitJobService.go | 35 ++++++++++++++++++++++++ internal/taskManager/taskManager.go | 1 + pkg/schema/config.go | 4 ++- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 internal/taskManager/commitJobService.go diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go new file mode 100644 index 0000000..7749348 --- /dev/null +++ b/internal/taskManager/commitJobService.go @@ -0,0 +1,35 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package taskManager
+
+import (
+	"time"
+
+	"github.com/ClusterCockpit/cc-backend/internal/config"
+	"github.com/ClusterCockpit/cc-backend/internal/repository"
+	"github.com/ClusterCockpit/cc-backend/pkg/log"
+	"github.com/go-co-op/gocron/v2"
+)
+
+func RegisterCommitJobService() {
+	var frequency string
+	if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.CommitJobWorker != "" {
+		frequency = config.Keys.CronFrequency.CommitJobWorker
+	} else {
+		frequency = "2m"
+	}
+	d, _ := time.ParseDuration(frequency)
+	log.Infof("Register commitJob service with %s interval", frequency)
+
+	s.NewJob(gocron.DurationJob(d),
+		gocron.NewTask(
+			func() {
+				start := time.Now()
+				log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339))
+				jobRepo.SyncJobs()
+				repository.CallJobStartHooks()
+				log.Printf("Jobcache sync is done and took %s", time.Since(start))
+			}))
+}
diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go
index 2004e0d..7d9a3a2 100644
--- a/internal/taskManager/taskManager.go
+++ b/internal/taskManager/taskManager.go
@@ -81,6 +81,7 @@ func Start() {
 	RegisterFootprintWorker()
 	RegisterUpdateDurationWorker()
+	RegisterCommitJobService()
 	s.Start()
 }
diff --git a/pkg/schema/config.go b/pkg/schema/config.go
index 27d11be..a5caa61 100644
--- a/pkg/schema/config.go
+++ b/pkg/schema/config.go
@@ -89,6 +89,8 @@ type ResampleConfig struct {
 }
 type CronFrequency struct {
+	// Commit Job Worker [Defaults to '2m']
+	CommitJobWorker string `json:"commit-job-worker"`
 	// Duration Update Worker [Defaults to '5m']
 	DurationWorker string `json:"duration-worker"`
 	// Metric-Footprint Update Worker [Defaults to '10m']
@@ -150,7 +152,7 @@ type ProgramConfig struct {
 	// If overwritten, at least all the options in the defaults below must
 	// be provided! Most options here can be overwritten by the user.
-	UiDefaults map[string]interface{} `json:"ui-defaults"`
+	UiDefaults map[string]any `json:"ui-defaults"`
 	// If exists, will enable dynamic zoom in frontend metric plots using the configured values
 	EnableResampling *ResampleConfig `json:"enable-resampling"`
From 2e781b900d4b5b9b50a212f53dd38759cc598bf7 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Fri, 16 May 2025 17:37:24 +0200
Subject: [PATCH 12/30] Staged error handling for job cache

---
 internal/api/rest.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/internal/api/rest.go b/internal/api/rest.go
index 669768e..e0804cb 100644
--- a/internal/api/rest.go
+++ b/internal/api/rest.go
@@ -820,7 +820,7 @@ func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) {
 	}
 	rw.WriteHeader(http.StatusOK)
-	rw.Write([]byte(fmt.Sprintf("Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount)))
+	fmt.Fprintf(rw, "Deleted Tags from DB: %d successful of %d requested\n", currentCount, targetCount)
 }
 // startJob godoc
@@ -846,6 +846,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
 		return
 	}
+	log.Printf("REST: %s\n", req.GoString())
 	req.State = schema.JobStateRunning
 	if err := importer.SanityChecks(&req.BaseJob); err != nil {
@@ -931,8 +932,12 @@ func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
 	// log.Printf("loading db job for stopJobByRequest...
: stopJobApiRequest=%v", req) job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return + job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime) + // FIXME: Previous error is hidden + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) + return + } } api.checkAndHandleStopJob(rw, job, req) @@ -1097,10 +1102,15 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo // Mark job as stopped in the database (update state and duration) job.Duration = int32(req.StopTime - job.StartTime.Unix()) job.State = req.State + api.JobRepository.Mutex.Lock() if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) - return + if err := api.JobRepository.StopCached(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + api.JobRepository.Mutex.Unlock() + handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + return + } } + api.JobRepository.Mutex.Unlock() log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) @@ -1116,6 +1126,8 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } + repository.CallJobStopHooks() + // Trigger async archiving archiver.TriggerArchiving(job) } From f06b5f8fc0c6196f4caa31cab856b3ad992803f3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:37:36 +0200 Subject: [PATCH 13/30] Refactor --- internal/auth/auth.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 5f88bbb..3e57768 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -237,7 +237,7 @@ func (auth *Authentication) Login( limiter := getIPUserLimiter(ip, username) if !limiter.Allow() { log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username) - onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes.")) + onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes")) return } From f30b784f45baf0b943be0176818d7e5728e70db3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:38:00 +0200 Subject: [PATCH 14/30] Attempt to fix api test Tests still fail --- internal/api/api_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index e67813c..2e864a3 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -253,6 +253,7 @@ func TestRestApi(t *testing.T) { t.Fatal(response.Status, recorder.Body.String()) } resolver := graph.GetResolverInstance() + restapi.JobRepository.SyncJobs() job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) @@ -312,7 +313,7 @@ func TestRestApi(t *testing.T) { } archiver.WaitForArchiving() - job, err := restapi.JobRepository.Find(&TestJobId, 
&TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.FindCached(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -425,7 +426,7 @@ func TestRestApi(t *testing.T) { archiver.WaitForArchiving() jobid, cluster := int64(12345), "testcluster" - job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) + job, err := restapi.JobRepository.FindCached(&jobid, &cluster, nil) if err != nil { t.Fatal(err) } From 99f8187092d6aab9d1c9c20399b9a2a1cfb7202e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 09:17:16 +0200 Subject: [PATCH 15/30] Port tests to new architecture --- internal/api/api_test.go | 30 +++++++++++++++--------------- internal/importer/importer_test.go | 2 +- internal/repository/job.go | 11 ++++++++--- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 2e864a3..3af37ad 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -123,7 +123,7 @@ func setup(t *testing.T) *api.RestApi { t.Fatal(err) } - if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil { + if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 2), 0666); err != nil { t.Fatal(err) } @@ -204,11 +204,11 @@ func TestRestApi(t *testing.T) { restapi.MountApiRoutes(r) var TestJobId int64 = 123 - var TestClusterName string = "testcluster" + TestClusterName := "testcluster" var TestStartTime int64 = 123456789 const startJobBody string = `{ - "jobId": 123, + "jobId": 123, "user": "testuser", "project": "testproj", "cluster": "testcluster", @@ -221,7 +221,6 @@ func TestRestApi(t *testing.T) { "exclusive": 1, "monitoringStatus": 1, "smt": 1, - "tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }], "resources": [ { "hostname": "host123", @@ -252,17 +251,17 @@ func TestRestApi(t *testing.T) { if response.StatusCode != http.StatusCreated { t.Fatal(response.Status, recorder.Body.String()) } - resolver := graph.GetResolverInstance() + // resolver := graph.GetResolverInstance() restapi.JobRepository.SyncJobs() job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } - job.Tags, err = resolver.Job().Tags(ctx, job) - if err != nil { - t.Fatal(err) - } + // job.Tags, err = resolver.Job().Tags(ctx, job) + // if err != nil { + // t.Fatal(err) + // } if job.JobID != 123 || job.User != "testuser" || @@ -283,9 +282,9 @@ func TestRestApi(t *testing.T) { t.Fatalf("unexpected job properties: %#v", job) } - if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { - t.Fatalf("unexpected tags: %#v", job.Tags) - } + // if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { + // t.Fatalf("unexpected tags: %#v", job.Tags) + // } }); !ok { return } @@ -313,7 +312,7 @@ func TestRestApi(t *testing.T) { } archiver.WaitForArchiving() - job, err := restapi.JobRepository.FindCached(&TestJobId, &TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -353,7 +352,7 @@ func TestRestApi(t *testing.T) { t.Run("CheckDoubleStart", func(t *testing.T) { // Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart! 
- body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1) + body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`) req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body))) recorder := httptest.NewRecorder() @@ -403,6 +402,7 @@ func TestRestApi(t *testing.T) { } time.Sleep(1 * time.Second) + restapi.JobRepository.SyncJobs() const stopJobBodyFailed string = `{ "jobId": 12345, @@ -426,7 +426,7 @@ func TestRestApi(t *testing.T) { archiver.WaitForArchiving() jobid, cluster := int64(12345), "testcluster" - job, err := restapi.JobRepository.FindCached(&jobid, &cluster, nil) + job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) if err != nil { t.Fatal(err) } diff --git a/internal/importer/importer_test.go b/internal/importer/importer_test.go index 209b6be..d2bb0b4 100644 --- a/internal/importer/importer_test.go +++ b/internal/importer/importer_test.go @@ -166,7 +166,7 @@ func TestHandleImportFlag(t *testing.T) { } result := readResult(t, testname) - job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime) + job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime) if err != nil { t.Fatal(err) } diff --git a/internal/repository/job.go b/internal/repository/job.go index 54a436a..29aa63e 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -51,10 +51,15 @@ func GetJobRepository() *JobRepository { return jobRepoInstance } +// var jobColumns []string = []string{ +// "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", +// "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", +// "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", +// } + var jobColumns []string = []string{ - "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", - "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", - "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", + "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state", + "duration", "walltime", "resources", "footprint", "energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { From 14bad81b9fd46ceca683aaffbe64566fb7b37972 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 13:25:39 +0200 Subject: [PATCH 16/30] Extend Job Hooks and add unit tests Add job tagger control --- internal/api/rest.go | 2 +- internal/repository/jobCreate.go | 32 +++++++++++++++---- internal/repository/jobHooks.go | 37 +++++++++++++++++----- internal/tagger/apps/python.txt | 3 ++ internal/tagger/detectApp.go | 28 +++++++++++------ internal/tagger/detectApp_test.go | 2 +- internal/tagger/tagger.go | 40 ++++++++++++++++++++++-- internal/tagger/tagger_test.go | 31 ++++++++++++++++++ internal/taskManager/commitJobService.go | 6 ++-- 9 files changed, 150 insertions(+), 31 deletions(-) create mode 100644 internal/tagger/apps/python.txt create mode 100644 internal/tagger/tagger_test.go diff --git a/internal/api/rest.go b/internal/api/rest.go index 
e0804cb..6133a5e 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -1126,7 +1126,7 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - repository.CallJobStopHooks() + repository.CallJobStopHooks(job) // Trigger async archiving archiver.TriggerArchiving(job) diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 3b997f3..a651db9 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -46,23 +46,43 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { return id, nil } -func (r *JobRepository) SyncJobs() error { +func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { r.Mutex.Lock() defer r.Mutex.Unlock() - _, err := r.DB.Exec( + + query := sq.Select(jobColumns...).From("job_cache") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Errorf("Error while running query %v", err) + return nil, err + } + + jobs := make([]*schema.Job, 0, 50) + for rows.Next() { + job, err := scanJob(rows) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jobs = append(jobs, job) + } + + _, err = r.DB.Exec( "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { log.Warnf("Error while Job sync: %v", err) - return err + return nil, err } _, err = r.DB.Exec("DELETE FROM job_cache") if err != nil { - log.Warn("Error while Job cache clean") - return err + log.Warnf("Error while Job cache clean: %v", err) + return nil, err } - return nil + return jobs, nil } // Start inserts a new job in the table, returning the unique job ID. diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index d69874f..1016335 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -4,31 +4,54 @@ // license that can be found in the LICENSE file. 
package repository +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + type JobHook interface { - jobStartCallback() - jobStopCallback() + JobStartCallback(job *schema.Job) + JobStopCallback(job *schema.Job) } -var hooks []JobHook +var ( + initOnce sync.Once + hooks []JobHook +) func RegisterJobJook(hook JobHook) { + initOnce.Do(func() { + hooks = make([]JobHook, 0) + }) + if hook != nil { hooks = append(hooks, hook) } } -func CallJobStartHooks() { +func CallJobStartHooks(jobs []*schema.Job) { + if hooks == nil { + return + } + for _, hook := range hooks { if hook != nil { - hook.jobStartCallback() + for _, job := range jobs { + hook.JobStartCallback(job) + } } } } -func CallJobStopHooks() { +func CallJobStopHooks(job *schema.Job) { + if hooks == nil { + return + } + for _, hook := range hooks { if hook != nil { - hook.jobStopCallback() + hook.JobStopCallback(job) } } } diff --git a/internal/tagger/apps/python.txt b/internal/tagger/apps/python.txt new file mode 100644 index 0000000..7a5c661 --- /dev/null +++ b/internal/tagger/apps/python.txt @@ -0,0 +1,3 @@ +python +anaconda +conda diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 339e398..44a08e0 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -8,6 +8,7 @@ import ( "bufio" "embed" "fmt" + "io/fs" "path/filepath" "strings" @@ -27,16 +28,10 @@ type appInfo struct { } type AppTagger struct { - apps []appInfo + apps map[string]appInfo } -func (t *AppTagger) Register() error { - files, err := appFiles.ReadDir("apps") - if err != nil { - return fmt.Errorf("error reading app folder: %#v", err) - } - t.apps = make([]appInfo, 0) - +func (t *AppTagger) scanApps(files []fs.DirEntry) error { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) @@ -50,12 +45,25 @@ func (t *AppTagger) Register() error { for scanner.Scan() { ai.strings = append(ai.strings, scanner.Text()) } - t.apps = append(t.apps, ai) + delete(t.apps, ai.tag) + t.apps[ai.tag] = ai } - return nil } +// func (t *AppTagger) Reload() error { +// +// } + +func (t *AppTagger) Register() error { + files, err := appFiles.ReadDir("apps") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.apps = make(map[string]appInfo, 0) + return t.scanApps(files) +} + func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() meta, err := r.FetchMetadata(job) diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 8978e35..3b43cce 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -35,7 +35,7 @@ func TestRegister(t *testing.T) { err := tagger.Register() noErr(t, err) - if len(tagger.apps) != 3 { + if len(tagger.apps) != 4 { t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 3", len(tagger.apps)) } } diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 52a369b..4fbbc9e 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -4,14 +4,48 @@ // license that can be found in the LICENSE file. 
package tagger -import "github.com/ClusterCockpit/cc-backend/pkg/schema" +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) type Tagger interface { Register() error Match(job *schema.Job) } -func Init() error { +var ( + initOnce sync.Once + jobTagger *JobTagger +) - return nil +type JobTagger struct { + startTaggers []Tagger + stopTaggers []Tagger +} + +func Init() { + initOnce.Do(func() { + jobTagger = &JobTagger{} + jobTagger.startTaggers = make([]Tagger, 0) + jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + + for _, tagger := range jobTagger.startTaggers { + tagger.Register() + } + + // jobTagger.stopTaggers = make([]Tagger, 0) + repository.RegisterJobJook(jobTagger) + }) +} + +func (jt *JobTagger) JobStartCallback(job *schema.Job) { + for _, tagger := range jobTagger.startTaggers { + tagger.Match(job) + } +} + +func (jt *JobTagger) JobStopCallback(job *schema.Job) { } diff --git a/internal/tagger/tagger_test.go b/internal/tagger/tagger_test.go new file mode 100644 index 0000000..057ca17 --- /dev/null +++ b/internal/tagger/tagger_test.go @@ -0,0 +1,31 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +func TestInit(t *testing.T) { + Init() +} + +func TestJobStartCallback(t *testing.T) { + Init() + r := setup(t) + job, err := r.FindByIdDirect(2) + noErr(t, err) + + jobs := make([]*schema.Job, 0, 1) + jobs = append(jobs, job) + + repository.CallJobStartHooks(jobs) + if !r.HasTag(2, "app", "python") { + t.Errorf("missing tag python") + } +} diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go index 7749348..c60acb3 100644 --- a/internal/taskManager/commitJobService.go +++ b/internal/taskManager/commitJobService.go @@ -28,8 +28,8 @@ func RegisterCommitJobService() { func() { start := time.Now() log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) - jobRepo.SyncJobs() - repository.CallJobStartHooks() - log.Printf("Jobcache sync is done and took %s", time.Since(start)) + jobs, _ := jobRepo.SyncJobs() + repository.CallJobStartHooks(jobs) + log.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start)) })) } From 85f17c0fd85fff07e14009f85566927955477f25 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 16:08:43 +0200 Subject: [PATCH 17/30] Refactor Tagger package. 
Add fsNotify Service --- cmd/cc-backend/main.go | 5 +++ go.mod | 1 + go.sum | 2 + internal/repository/job.go | 6 ++- internal/tagger/detectApp.go | 64 ++++++++++++++++++++++--------- internal/tagger/tagger.go | 1 - internal/util/fswatcher.go | 73 ++++++++++++++++++++++++++++++++++++ 7 files changed, 131 insertions(+), 21 deletions(-) create mode 100644 internal/util/fswatcher.go diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 4b6d7f9..cbfccef 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -19,7 +19,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/tagger" "github.com/ClusterCockpit/cc-backend/internal/taskManager" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv" @@ -216,6 +218,7 @@ func main() { } archiver.Start(repository.GetJobRepository()) + tagger.Init() taskManager.Start() serverInit() @@ -237,6 +240,8 @@ func main() { serverShutdown() + util.FsWatcherShutdown() + taskManager.Shutdown() }() diff --git a/go.mod b/go.mod index 98d1cab..f17ec18 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,7 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect github.com/go-jose/go-jose/v4 v4.0.5 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect diff --git a/go.sum b/go.sum index a76e112..57b1649 100644 --- a/go.sum +++ b/go.sum @@ -55,6 +55,8 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs= diff --git a/internal/repository/job.go b/internal/repository/job.go index 29aa63e..73a2588 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -58,8 +58,10 @@ func GetJobRepository() *JobRepository { // } var jobColumns []string = []string{ - "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state", - "duration", "walltime", "resources", "footprint", "energy", + "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", + "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", + "exclusive", "monitoring_status", "smt", "job_state", "duration", "walltime", + "resources", "footprint", "energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { diff 
--git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 44a08e0..621e20c 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -9,15 +9,20 @@ import ( "embed" "fmt" "io/fs" + "os" "path/filepath" "strings" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -const tagType = "app" +const ( + tagType = "app" + appPath = "./var/tagger/apps" +) //go:embed apps/* var appFiles embed.FS @@ -31,37 +36,60 @@ type AppTagger struct { apps map[string]appInfo } -func (t *AppTagger) scanApps(files []fs.DirEntry) error { +func (t *AppTagger) scanApp(f fs.File, fns string) { + scanner := bufio.NewScanner(f) + ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + + for scanner.Scan() { + ai.strings = append(ai.strings, scanner.Text()) + } + delete(t.apps, ai.tag) + t.apps[ai.tag] = ai +} + +func (t *AppTagger) EventMatch(s string) bool { + return strings.Contains(s, "apps") +} + +func (t *AppTagger) EventCallback() { + files, err := os.ReadDir(appPath) + if err != nil { + log.Fatal(err) + } + for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + f, err := os.Open(fmt.Sprintf("%s/%s", appPath, fns)) if err != nil { - return fmt.Errorf("error opening app file %s: %#v", fns, err) + log.Errorf("error opening app file %s: %#v", fns, err) } - scanner := bufio.NewScanner(f) - ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} - - for scanner.Scan() { - ai.strings = append(ai.strings, scanner.Text()) - } - delete(t.apps, ai.tag) - t.apps[ai.tag] = ai + t.scanApp(f, fns) } - return nil } -// func (t *AppTagger) Reload() error { -// -// } - func (t *AppTagger) Register() error { files, err := appFiles.ReadDir("apps") if err != nil { return fmt.Errorf("error reading app folder: %#v", err) } t.apps = make(map[string]appInfo, 0) - return t.scanApps(files) + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + t.scanApp(f, fns) + } + + if util.CheckFileExists(appPath) { + log.Infof("Setup file watch for %s", appPath) + util.AddListener(appPath, t) + } + + return nil } func (t *AppTagger) Match(job *schema.Job) { diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 4fbbc9e..b336125 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -36,7 +36,6 @@ func Init() { tagger.Register() } - // jobTagger.stopTaggers = make([]Tagger, 0) repository.RegisterJobJook(jobTagger) }) } diff --git a/internal/util/fswatcher.go b/internal/util/fswatcher.go new file mode 100644 index 0000000..aaf3372 --- /dev/null +++ b/internal/util/fswatcher.go @@ -0,0 +1,73 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package util + +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/fsnotify/fsnotify" +) + +type Listener interface { + EventCallback() + EventMatch(event string) bool +} + +var ( + initOnce sync.Once + w *fsnotify.Watcher + listeners []Listener +) + +func AddListener(path string, l Listener) { + var err error + + initOnce.Do(func() { + var err error + w, err = fsnotify.NewWatcher() + if err != nil { + log.Error("creating a new watcher: %w", err) + } + listeners = make([]Listener, 0) + + go watchLoop(w) + }) + + listeners = append(listeners, l) + err = w.Add(path) + if err != nil { + log.Warnf("%q: %s", path, err) + } +} + +func FsWatcherShutdown() { + w.Close() +} + +func watchLoop(w *fsnotify.Watcher) { + for { + select { + // Read from Errors. + case err, ok := <-w.Errors: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + log.Errorf("watch event loop: %s", err) + // Read from Events. + case e, ok := <-w.Events: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + + log.Infof("Event %s", e) + for _, l := range listeners { + if l.EventMatch(e.String()) { + l.EventCallback() + } + } + } + } +} From 9abc206d1af849d86e737c5219f6ab248e830e50 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 20 May 2025 07:10:15 +0200 Subject: [PATCH 18/30] Read in tagger config on startup. Safeguard watcher shutdown --- internal/tagger/detectApp.go | 2 ++ internal/util/fswatcher.go | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 621e20c..d3d797d 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -51,6 +51,7 @@ func (t *AppTagger) EventMatch(s string) bool { return strings.Contains(s, "apps") } +// FIXME: Only process the file that caused the event func (t *AppTagger) EventCallback() { files, err := os.ReadDir(appPath) if err != nil { @@ -85,6 +86,7 @@ func (t *AppTagger) Register() error { } if util.CheckFileExists(appPath) { + t.EventCallback() log.Infof("Setup file watch for %s", appPath) util.AddListener(appPath, t) } diff --git a/internal/util/fswatcher.go b/internal/util/fswatcher.go index aaf3372..5d13462 100644 --- a/internal/util/fswatcher.go +++ b/internal/util/fswatcher.go @@ -44,7 +44,9 @@ func AddListener(path string, l Listener) { } func FsWatcherShutdown() { - w.Close() + if w != nil { + w.Close() + } } func watchLoop(w *fsnotify.Watcher) { From ca634bb70741697f7f34cd21df998d1100b2ff76 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 22 May 2025 07:10:41 +0200 Subject: [PATCH 19/30] Refactor taggers. Refine Job Hooks. 
Start job classifier --- go.mod | 1 + go.sum | 2 + internal/api/rest.go | 2 - internal/archiver/archiveWorker.go | 7 ++ internal/repository/jobHooks.go | 4 +- internal/tagger/classifyJob.go | 121 +++++++++++++++++++++ internal/tagger/detectApp.go | 35 +++--- internal/tagger/detectApp_test.go | 8 +- internal/tagger/jobclasses/highload.json | 38 +++++++ internal/tagger/jobclasses/highmem.json | 40 +++++++ internal/tagger/jobclasses/lowgpuload.json | 36 ++++++ internal/tagger/jobclasses/lowload.json | 38 +++++++ internal/tagger/tagger.go | 6 +- pkg/archive/fsBackend.go | 39 +------ 14 files changed, 316 insertions(+), 61 deletions(-) create mode 100644 internal/tagger/classifyJob.go create mode 100644 internal/tagger/jobclasses/highload.json create mode 100644 internal/tagger/jobclasses/highmem.json create mode 100644 internal/tagger/jobclasses/lowgpuload.json create mode 100644 internal/tagger/jobclasses/lowload.json diff --git a/go.mod b/go.mod index f17ec18..062ee3e 100644 --- a/go.mod +++ b/go.mod @@ -43,6 +43,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect + github.com/expr-lang/expr v1.17.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect diff --git a/go.sum b/go.sum index 57b1649..b4c3781 100644 --- a/go.sum +++ b/go.sum @@ -53,6 +53,8 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/expr-lang/expr v1.17.3 h1:myeTTuDFz7k6eFe/JPlep/UsiIjVhG61FMHFu63U7j0= +github.com/expr-lang/expr v1.17.3/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= diff --git a/internal/api/rest.go b/internal/api/rest.go index 6133a5e..fe35942 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -1126,8 +1126,6 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - repository.CallJobStopHooks(job) - // Trigger async archiving archiver.TriggerArchiving(job) } diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 628e36e..6e514cb 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -72,7 +72,14 @@ func archivingWorker() { } log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) log.Printf("archiving job (dbid: %d) successful", job.ID) + + id := job.ID + jobMeta.ID = &id + + repository.CallJobStopHooks(jobMeta) archivePending.Done() + default: + continue } } } diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 1016335..49535f7 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -12,7 +12,7 @@ import ( type JobHook interface { JobStartCallback(job *schema.Job) - JobStopCallback(job *schema.Job) + JobStopCallback(job *schema.JobMeta) } var ( @@ -44,7 +44,7 @@ func CallJobStartHooks(jobs []*schema.Job) { } } -func 
CallJobStopHooks(job *schema.Job) { +func CallJobStopHooks(job *schema.JobMeta) { if hooks == nil { return } diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go new file mode 100644 index 0000000..ec1e843 --- /dev/null +++ b/internal/tagger/classifyJob.go @@ -0,0 +1,121 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "bytes" + "embed" + "fmt" + "io/fs" + "os" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/expr-lang/expr" + "github.com/expr-lang/expr/vm" +) + +//go:embed jobclasses/* +var jobclassFiles embed.FS + +type ruleInfo struct { + tag string + rule *vm.Program +} + +type JobClassTagger struct { + rules map[string]ruleInfo + tagType string + cfgPath string +} + +func (t *JobClassTagger) compileRule(f fs.File, fns string) { + buf := new(bytes.Buffer) + _, err := buf.ReadFrom(f) + if err != nil { + log.Errorf("error reading rule file %s: %#v", fns, err) + } + prg, err := expr.Compile(buf.String(), expr.AsBool()) + if err != nil { + log.Errorf("error compiling rule %s: %#v", fns, err) + } + ri := ruleInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), rule: prg} + + delete(t.rules, ri.tag) + t.rules[ri.tag] = ri +} + +func (t *JobClassTagger) EventMatch(s string) bool { + return strings.Contains(s, "jobclasses") +} + +// FIXME: Only process the file that caused the event +func (t *JobClassTagger) EventCallback() { + files, err := os.ReadDir(t.cfgPath) + if err != nil { + log.Fatal(err) + } + + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) + if err != nil { + log.Errorf("error opening app file %s: %#v", fns, err) + } + t.compileRule(f, fns) + } +} + +func (t *JobClassTagger) Register() error { + t.cfgPath = "./var/tagger/jobclasses" + t.tagType = "jobClass" + + files, err := appFiles.ReadDir("jobclasses") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.rules = make(map[string]ruleInfo, 0) + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + defer f.Close() + t.compileRule(f, fns) + } + + if util.CheckFileExists(t.cfgPath) { + t.EventCallback() + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) + } + + return nil +} + +func (t *JobClassTagger) Match(job *schema.JobMeta) { + r := repository.GetJobRepository() + + for _, ri := range t.rules { + tag := ri.tag + output, err := expr.Run(ri.rule, job) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + } + if output.(bool) { + id := job.ID + if !r.HasTag(*id, t.tagType, tag) { + r.AddTagOrCreateDirect(*id, t.tagType, tag) + } + } + } +} diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index d3d797d..8057aad 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -19,11 +19,6 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -const ( - tagType = "app" - appPath = "./var/tagger/apps" -) - //go:embed 
apps/* var appFiles embed.FS @@ -33,7 +28,9 @@ type appInfo struct { } type AppTagger struct { - apps map[string]appInfo + apps map[string]appInfo + tagType string + cfgPath string } func (t *AppTagger) scanApp(f fs.File, fns string) { @@ -53,7 +50,7 @@ func (t *AppTagger) EventMatch(s string) bool { // FIXME: Only process the file that caused the event func (t *AppTagger) EventCallback() { - files, err := os.ReadDir(appPath) + files, err := os.ReadDir(t.cfgPath) if err != nil { log.Fatal(err) } @@ -61,7 +58,7 @@ func (t *AppTagger) EventCallback() { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := os.Open(fmt.Sprintf("%s/%s", appPath, fns)) + f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) if err != nil { log.Errorf("error opening app file %s: %#v", fns, err) } @@ -70,6 +67,9 @@ func (t *AppTagger) EventCallback() { } func (t *AppTagger) Register() error { + t.cfgPath = "./var/tagger/apps" + t.tagType = "app" + files, err := appFiles.ReadDir("apps") if err != nil { return fmt.Errorf("error reading app folder: %#v", err) @@ -79,28 +79,25 @@ func (t *AppTagger) Register() error { fns := fn.Name() log.Debugf("Process: %s", fns) f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + defer f.Close() if err != nil { return fmt.Errorf("error opening app file %s: %#v", fns, err) } t.scanApp(f, fns) } - if util.CheckFileExists(appPath) { + if util.CheckFileExists(t.cfgPath) { t.EventCallback() - log.Infof("Setup file watch for %s", appPath) - util.AddListener(appPath, t) + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) } return nil } -func (t *AppTagger) Match(job *schema.Job) { +func (t *AppTagger) Match(job *schema.JobMeta) { r := repository.GetJobRepository() - meta, err := r.FetchMetadata(job) - if err != nil { - log.Error("cannot fetch meta data") - } - jobscript, ok := meta["jobScript"] + jobscript, ok := job.MetaData["jobScript"] if ok { id := job.ID @@ -109,8 +106,8 @@ func (t *AppTagger) Match(job *schema.Job) { tag := a.tag for _, s := range a.strings { if strings.Contains(jobscript, s) { - if !r.HasTag(id, tagType, tag) { - r.AddTagOrCreateDirect(id, tagType, tag) + if !r.HasTag(*id, t.tagType, tag) { + r.AddTagOrCreateDirect(*id, t.tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 3b43cce..56bd856 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -9,6 +9,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" ) func setup(tb testing.TB) *repository.JobRepository { @@ -51,7 +52,12 @@ func TestMatch(t *testing.T) { err = tagger.Register() noErr(t, err) - tagger.Match(job) + jobMeta := &schema.JobMeta{ + ID: &job.ID, + BaseJob: job.BaseJob, + StartTime: job.StartTime.Unix(), + } + tagger.Match(jobMeta) if !r.HasTag(5, "app", "vasp") { t.Errorf("missing tag vasp") diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json new file mode 100644 index 0000000..a65f400 --- /dev/null +++ b/internal/tagger/jobclasses/highload.json @@ -0,0 +1,38 @@ +{ + "name": "Excessive CPU load", + "tag": "excessiveload", + "comment": "Assumptions: all nodes have the same number of cores.", + "parameters": [ + "excessivecpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "cpu_load" + ], + "requirements": [ + 
"job.exclusive == 1", + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "terms": [ + { + "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')" + }, + { + "load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" + }, + { + "highload_nodes": "load_mean > load_threshold" + }, + { + "highload": "highload_nodes.any('all')" + }, + { + "load_perc": "load_mean / load_threshold" + } + ], + "output": "highload", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/jobclasses/highmem.json b/internal/tagger/jobclasses/highmem.json new file mode 100644 index 0000000..69ffcf3 --- /dev/null +++ b/internal/tagger/jobclasses/highmem.json @@ -0,0 +1,40 @@ +{ + "name": "High memory usage", + "tag": "high_memory_load", + "parameters": [ + "high_memory_load_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "mem_used" + ], + "requirements": [ + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds", + "hasattr(job, \"allocated_memory\")" + ], + "terms": [ + { + "memory_alloc": "job.allocated_memory" + }, + { + "memory_used": "mem_used.max('time')" + }, + { + "load_threshold": "memory_alloc * high_memory_load_threshold_factor" + }, + { + "high_mem_nodes": "memory_used > load_threshold" + }, + { + "high_mem": "high_mem_nodes.any('all')" + }, + { + "load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)" + } + ], + "output": "high_mem", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/jobclasses/lowgpuload.json b/internal/tagger/jobclasses/lowgpuload.json new file mode 100644 index 0000000..80339b2 --- /dev/null +++ b/internal/tagger/jobclasses/lowgpuload.json @@ -0,0 +1,36 @@ +{ + "name": "Low GPU load", + "tag": "lowgpuload", + "parameters": [ + "lowgpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "nv_util" + ], + "requirements": [ + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "terms": [ + { + "load_mean": "nv_util.mean('all')" + }, + { + "load_threshold": "job.numAcc * lowgpuload_threshold_factor" + }, + { + "lowload_nodes": "load_mean < load_threshold" + }, + { + "lowload": "lowload_nodes.any('all')" + }, + { + "load_perc": "1.0 - (load_mean / load_threshold)" + } + ], + "output": "lowload", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." 
+} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json new file mode 100644 index 0000000..e860361 --- /dev/null +++ b/internal/tagger/jobclasses/lowload.json @@ -0,0 +1,38 @@ +{ + "name": "Low CPU load", + "tag": "lowload", + "parameters": [ + "lowcpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "cpu_load" + ], + "requirements": [ + "job.exclusive == 1", + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "tagRule": [ + { + "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')" + }, + { + "load_threshold": "job.numHwthreads * lowcpuload_threshold_factor" + }, + { + "lowload_nodes": "load_mean < load_threshold" + }, + { + "lowload": "lowload_nodes.any('all')" + }, + { + "load_perc": "1.0 - (load_mean / load_threshold)" + } + ], + "valueRule": [], + "output": "lowload", + "output_scalar": "load_perc", + "hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index b336125..d5e42b1 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -13,7 +13,7 @@ import ( type Tagger interface { Register() error - Match(job *schema.Job) + Match(job *schema.JobMeta) } var ( @@ -31,6 +31,8 @@ func Init() { jobTagger = &JobTagger{} jobTagger.startTaggers = make([]Tagger, 0) jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + jobTagger.stopTaggers = make([]Tagger, 0) + jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) for _, tagger := range jobTagger.startTaggers { tagger.Register() @@ -46,5 +48,5 @@ func (jt *JobTagger) JobStartCallback(job *schema.Job) { } } -func (jt *JobTagger) JobStopCallback(job *schema.Job) { +func (jt *JobTagger) JobStopCallback(job *schema.JobMeta) { } diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 711b1f5..a59b663 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -59,14 +59,13 @@ func getDirectory( func getPath( job *schema.Job, rootPath string, - file string) string { - + file string, +) string { return filepath.Join( getDirectory(job, rootPath), file) } func loadJobMeta(filename string) (*schema.JobMeta, error) { - b, err := os.ReadFile(filename) if err != nil { log.Errorf("loadJobMeta() > open file error: %v", err) @@ -83,7 +82,6 @@ func loadJobMeta(filename string) (*schema.JobMeta, error) { func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobData()- %v", err) return nil, err @@ -117,7 +115,6 @@ func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobStats()- %v", err) return nil, err @@ -150,7 +147,6 @@ func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, er } func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) { - var config FsArchiveConfig if err := json.Unmarshal(rawConfig, &config); err != nil { log.Warnf("Init() > Unmarshal error: %#v", err) @@ -276,7 +272,6 @@ func (fsa *FsArchive) Exists(job *schema.Job) bool { } func (fsa *FsArchive) Clean(before int64, after int64) { - if 
after == 0 { after = math.MaxInt64 } @@ -392,7 +387,6 @@ func (fsa *FsArchive) Compress(jobs []*schema.Job) { } func (fsa *FsArchive) CompressLast(starttime int64) int64 { - filename := filepath.Join(fsa.path, "compress.txt") b, err := os.ReadFile(filename) if err != nil { @@ -441,7 +435,6 @@ func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { } func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { - b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json")) if err != nil { log.Errorf("LoadClusterCfg() > open file error: %v", err) @@ -456,7 +449,6 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { } func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { - ch := make(chan JobContainer) go func() { clustersDir, err := os.ReadDir(fsa.path) @@ -527,7 +519,6 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { } func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error { - job := schema.Job{ BaseJob: jobMeta.BaseJob, StartTime: time.Unix(jobMeta.StartTime, 0), @@ -556,8 +547,8 @@ func (fsa *FsArchive) GetClusters() []string { func (fsa *FsArchive) ImportJob( jobMeta *schema.JobMeta, - jobData *schema.JobData) error { - + jobData *schema.JobData, +) error { job := schema.Job{ BaseJob: jobMeta.BaseJob, StartTime: time.Unix(jobMeta.StartTime, 0), @@ -583,28 +574,6 @@ func (fsa *FsArchive) ImportJob( return err } - // var isCompressed bool = true - // // TODO Use shortJob Config for check - // if jobMeta.Duration < 300 { - // isCompressed = false - // f, err = os.Create(path.Join(dir, "data.json")) - // } else { - // f, err = os.Create(path.Join(dir, "data.json.gz")) - // } - // if err != nil { - // return err - // } - // - // if isCompressed { - // if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil { - // return err - // } - // } else { - // if err := EncodeJobData(f, jobData); err != nil { - // return err - // } - // } - f, err = os.Create(path.Join(dir, "data.json")) if err != nil { log.Error("Error while creating filepath for data.json") From 733e3ea9d584592d8a78a7244bb2a0a678ffcb06 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 23 May 2025 07:48:27 +0200 Subject: [PATCH 20/30] Revert interface from jobMeta to job type. Extend job classifier tagger. Cleanup test rules. 
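This revision compiles each rule file in three stages: "requirements" become boolean guard programs, "variables" become float64 programs whose results are written back into the evaluation environment, and "rule" becomes the final boolean program. Below is a minimal sketch of that pipeline, assuming only the expr-lang API already used in this series (expr.Compile with AsBool/AsFloat64 options, expr.Run over a map environment); all values are invented:

package main

import (
	"fmt"

	"github.com/expr-lang/expr"
)

func main() {
	// Environment mirroring what Match() builds: parameters, job facts,
	// and metric averages. The numbers here are made up.
	env := map[string]any{
		"lowcpuload_threshold_factor": 0.9,
		"cpu_load":                    12.4,
		"job": map[string]any{"numCores": 72, "numNodes": 1, "duration": 3600},
	}

	// Stage 2: a variable expression is compiled with AsFloat64 and its
	// result is written back into the environment under its name.
	v, err := expr.Compile("job.numCores * lowcpuload_threshold_factor", expr.AsFloat64())
	if err != nil {
		panic(err)
	}
	threshold, _ := expr.Run(v, env)
	env["load_threshold"] = threshold

	// Stage 3: the rule itself must evaluate to a bool.
	r, _ := expr.Compile("cpu_load < load_threshold", expr.AsBool())
	match, _ := expr.Run(r, env)
	fmt.Println(match.(bool)) // true: 12.4 < 64.8
}

With lowload.json's expressions and a 72-core exclusive job averaging a cpu_load of 12.4, the rule matches and the job would receive the jobClass/lowload tag.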
--- internal/archiver/archiveWorker.go | 5 +- internal/repository/jobHooks.go | 4 +- internal/tagger/classifyJob.go | 193 +++++++++++++++++---- internal/tagger/detectApp.go | 6 +- internal/tagger/detectApp_test.go | 8 +- internal/tagger/jobclasses/highload.json | 21 +-- internal/tagger/jobclasses/highmem.json | 40 ----- internal/tagger/jobclasses/lowgpuload.json | 36 ---- internal/tagger/jobclasses/lowload.json | 26 +-- internal/tagger/jobclasses/parameters.json | 14 ++ internal/tagger/tagger.go | 7 +- 11 files changed, 202 insertions(+), 158 deletions(-) delete mode 100644 internal/tagger/jobclasses/highmem.json delete mode 100644 internal/tagger/jobclasses/lowgpuload.json create mode 100644 internal/tagger/jobclasses/parameters.json diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 6e514cb..42a60b9 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -73,10 +73,7 @@ func archivingWorker() { log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) log.Printf("archiving job (dbid: %d) successful", job.ID) - id := job.ID - jobMeta.ID = &id - - repository.CallJobStopHooks(jobMeta) + repository.CallJobStopHooks(job) archivePending.Done() default: continue diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 49535f7..1016335 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -12,7 +12,7 @@ import ( type JobHook interface { JobStartCallback(job *schema.Job) - JobStopCallback(job *schema.JobMeta) + JobStopCallback(job *schema.Job) } var ( @@ -44,7 +44,7 @@ func CallJobStartHooks(jobs []*schema.Job) { } } -func CallJobStopHooks(job *schema.JobMeta) { +func CallJobStopHooks(job *schema.Job) { if hooks == nil { return } diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index ec1e843..f7195e3 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -7,14 +7,16 @@ package tagger import ( "bytes" "embed" + "encoding/json" "fmt" - "io/fs" + "maps" "os" - "path/filepath" "strings" + "text/template" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/expr-lang/expr" @@ -24,31 +26,100 @@ import ( //go:embed jobclasses/* var jobclassFiles embed.FS +type Variable struct { + Name string `json:"name"` + Expr string `json:"expr"` +} + +type ruleVariable struct { + name string + expr *vm.Program +} + +type RuleFormat struct { + Name string `json:"name"` + Tag string `json:"tag"` + Parameters []string `json:"parameters"` + Metrics []string `json:"metrics"` + Requirements []string `json:"requirements"` + Variables []Variable `json:"variables"` + Rule string `json:"rule"` + Hint string `json:"hint"` +} + type ruleInfo struct { - tag string - rule *vm.Program + env map[string]any + metrics []string + requirements []*vm.Program + variables []ruleVariable + rule *vm.Program + hint *template.Template } type JobClassTagger struct { - rules map[string]ruleInfo - tagType string - cfgPath string + rules map[string]ruleInfo + parameters map[string]any + tagType string + cfgPath string } -func (t *JobClassTagger) compileRule(f fs.File, fns string) { - buf := new(bytes.Buffer) - _, err := buf.ReadFrom(f) +func (t *JobClassTagger) prepareRule(filename string, fns string) { + b, err := 
os.ReadFile(filename) if err != nil { - log.Errorf("error reading rule file %s: %#v", fns, err) + log.Warnf("prepareRule() > open file error: %v", err) + return } - prg, err := expr.Compile(buf.String(), expr.AsBool()) + + var rule RuleFormat + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil { + log.Warn("Error while decoding raw job meta json") + return + } + + ri := ruleInfo{} + ri.env = make(map[string]any) + ri.metrics = make([]string, 0) + ri.requirements = make([]*vm.Program, 0) + ri.variables = make([]ruleVariable, 0) + + // check if all required parameters are available + for _, p := range rule.Parameters { + param, ok := t.parameters[p] + if !ok { + log.Warnf("prepareRule() > missing parameter %s in rule %s", p, fns) + return + } + ri.env[p] = param + } + + // set all required metrics + for _, m := range rule.Metrics { + ri.metrics = append(ri.metrics, m) + } + + // compile requirements + for _, r := range rule.Requirements { + req, err := expr.Compile(r, expr.AsBool()) + if err != nil { + log.Errorf("error compiling requirement %s: %#v", r, err) + return + } + ri.requirements = append(ri.requirements, req) + } + + // compile rule + exp, err := expr.Compile(rule.Rule, expr.AsBool()) if err != nil { log.Errorf("error compiling rule %s: %#v", fns, err) + return } - ri := ruleInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), rule: prg} + ri.rule = exp - delete(t.rules, ri.tag) - t.rules[ri.tag] = ri + // prepare hint template + ri.hint = template.Must(template.New(fns).Parse(rule.Hint)) + + delete(t.rules, rule.Tag) + t.rules[rule.Tag] = ri } func (t *JobClassTagger) EventMatch(s string) bool { @@ -65,11 +136,8 @@ func (t *JobClassTagger) EventCallback() { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) - if err != nil { - log.Errorf("error opening app file %s: %#v", fns, err) - } - t.compileRule(f, fns) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + t.prepareRule(filename, fns) } } @@ -84,13 +152,23 @@ func (t *JobClassTagger) Register() error { t.rules = make(map[string]ruleInfo, 0) for _, fn := range files { fns := fn.Name() - log.Debugf("Process: %s", fns) - f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) - if err != nil { - return fmt.Errorf("error opening app file %s: %#v", fns, err) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + + if fn.Name() == "parameters.json" { + b, err := os.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return err + } + + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil { + log.Warn("Error while decoding parameters.json") + return err + } + continue } - defer f.Close() - t.compileRule(f, fns) + log.Debugf("Process: %s", fns) + t.prepareRule(filename, fns) } if util.CheckFileExists(t.cfgPath) { @@ -102,20 +180,69 @@ func (t *JobClassTagger) Register() error { return nil } -func (t *JobClassTagger) Match(job *schema.JobMeta) { +func (t *JobClassTagger) Match(job *schema.Job) { r := repository.GetJobRepository() + jobstats, err := archive.GetStatistics(job) + if err != nil { + log.Errorf("job classification failed for job %d: %#v", job.JobID, err) + return + } - for _, ri := range t.rules { - tag := ri.tag - output, err := expr.Run(ri.rule, job) + for tag, ri := range t.rules { + env := make(map[string]any) + maps.Copy(env, ri.env) + + // add metrics to env + for _, m := range ri.metrics { + stats, ok := jobstats[m] + if !ok { + log.Errorf("job 
classification failed for job %d: missing metric '%s'", job.JobID, m) + return + } + env[m] = stats.Avg + } + + // check rule requirements apply + for _, r := range ri.requirements { + ok, err := expr.Run(r, env) + if err != nil { + log.Errorf("error running requirement for rule %s: %#v", tag, err) + return + } + if !ok.(bool) { + log.Infof("requirement for rule %s not met", tag) + return + } + } + + // validate rule expression + for _, v := range ri.variables { + value, err := expr.Run(v.expr, env) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + return + } + env[v.name] = value + } + + match, err := expr.Run(ri.rule, job) if err != nil { log.Errorf("error running rule %s: %#v", tag, err) } - if output.(bool) { + if match.(bool) { id := job.ID - if !r.HasTag(*id, t.tagType, tag) { - r.AddTagOrCreateDirect(*id, t.tagType, tag) + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) } } + + // process hint template + var msg bytes.Buffer + if err := ri.hint.Execute(&msg, env); err != nil { + log.Errorf("Template error: %s", err.Error()) + } + + // FIXME: Handle case where multiple tags apply + r.UpdateMetadata(job, "message", msg.String()) } } diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 8057aad..a37924e 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -95,7 +95,7 @@ func (t *AppTagger) Register() error { return nil } -func (t *AppTagger) Match(job *schema.JobMeta) { +func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() jobscript, ok := job.MetaData["jobScript"] if ok { @@ -106,8 +106,8 @@ func (t *AppTagger) Match(job *schema.JobMeta) { tag := a.tag for _, s := range a.strings { if strings.Contains(jobscript, s) { - if !r.HasTag(*id, t.tagType, tag) { - r.AddTagOrCreateDirect(*id, t.tagType, tag) + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 56bd856..3b43cce 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -9,7 +9,6 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" ) func setup(tb testing.TB) *repository.JobRepository { @@ -52,12 +51,7 @@ func TestMatch(t *testing.T) { err = tagger.Register() noErr(t, err) - jobMeta := &schema.JobMeta{ - ID: &job.ID, - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - } - tagger.Match(jobMeta) + tagger.Match(job) if !r.HasTag(5, "app", "vasp") { t.Errorf("missing tag vasp") diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index a65f400..29d4026 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -12,27 +12,22 @@ ], "requirements": [ "job.exclusive == 1", - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + "job.duration > job_min_duration_seconds" ], "terms": [ { + "name": "", "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')" }, { - "load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" + "name": "load_threshold", + "expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" }, { - "highload_nodes": "load_mean > load_threshold" - }, - { - "highload": 
"highload_nodes.any('all')" - }, - { - "load_perc": "load_mean / load_threshold" + "name": "load_perc", + "expr": "load_mean / load_threshold" } ], - "output": "highload", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}." + "rule": "cpu_load > load_threshold", + "hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}." } diff --git a/internal/tagger/jobclasses/highmem.json b/internal/tagger/jobclasses/highmem.json deleted file mode 100644 index 69ffcf3..0000000 --- a/internal/tagger/jobclasses/highmem.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "name": "High memory usage", - "tag": "high_memory_load", - "parameters": [ - "high_memory_load_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" - ], - "metrics": [ - "mem_used" - ], - "requirements": [ - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds", - "hasattr(job, \"allocated_memory\")" - ], - "terms": [ - { - "memory_alloc": "job.allocated_memory" - }, - { - "memory_used": "mem_used.max('time')" - }, - { - "load_threshold": "memory_alloc * high_memory_load_threshold_factor" - }, - { - "high_mem_nodes": "memory_used > load_threshold" - }, - { - "high_mem": "high_mem_nodes.any('all')" - }, - { - "load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)" - } - ], - "output": "high_mem", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}." -} diff --git a/internal/tagger/jobclasses/lowgpuload.json b/internal/tagger/jobclasses/lowgpuload.json deleted file mode 100644 index 80339b2..0000000 --- a/internal/tagger/jobclasses/lowgpuload.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "name": "Low GPU load", - "tag": "lowgpuload", - "parameters": [ - "lowgpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" - ], - "metrics": [ - "nv_util" - ], - "requirements": [ - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" - ], - "terms": [ - { - "load_mean": "nv_util.mean('all')" - }, - { - "load_threshold": "job.numAcc * lowgpuload_threshold_factor" - }, - { - "lowload_nodes": "load_mean < load_threshold" - }, - { - "lowload": "lowload_nodes.any('all')" - }, - { - "load_perc": "1.0 - (load_mean / load_threshold)" - } - ], - "output": "lowload", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." 
-} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index e860361..3c5bd4d 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -11,28 +11,18 @@ ], "requirements": [ "job.exclusive == 1", - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + "job.duration > job_min_duration_seconds" ], - "tagRule": [ + "variables": [ { - "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')" + "name": "load_threshold", + "expr": "job.numHwthreads * lowcpuload_threshold_factor" }, { - "load_threshold": "job.numHwthreads * lowcpuload_threshold_factor" - }, - { - "lowload_nodes": "load_mean < load_threshold" - }, - { - "lowload": "lowload_nodes.any('all')" - }, - { - "load_perc": "1.0 - (load_mean / load_threshold)" + "name": "load_perc", + "expr": "1.0 - (cpu_load / load_threshold)" } ], - "valueRule": [], - "output": "lowload", - "output_scalar": "load_perc", - "hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." + "rule": "cpu_load < load_threshold", + "hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}." } diff --git a/internal/tagger/jobclasses/parameters.json b/internal/tagger/jobclasses/parameters.json new file mode 100644 index 0000000..39e94c1 --- /dev/null +++ b/internal/tagger/jobclasses/parameters.json @@ -0,0 +1,14 @@ +{ + "lowcpuload_threshold_factor": 0.9, + "excessivecpuload_threshold_factor": 1.1, + "highmemoryusage_threshold_factor": 0.9, + "node_load_imbalance_threshold_factor": 0.1, + "core_load_imbalance_threshold_factor": 0.1, + "high_memory_load_threshold_factor": 0.9, + "lowgpuload_threshold_factor": 0.7, + "memory_leak_slope_threshold": 0.1, + "job_min_duration_seconds": 600.0, + "sampling_interval_seconds": 30.0, + "cpu_load_pre_cutoff_samples": 11.0, + "cpu_load_core_pre_cutoff_samples": 6.0 +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index d5e42b1..ffdd011 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -13,7 +13,7 @@ import ( type Tagger interface { Register() error - Match(job *schema.JobMeta) + Match(job *schema.Job) } var ( @@ -48,5 +48,8 @@ func (jt *JobTagger) JobStartCallback(job *schema.Job) { } } -func (jt *JobTagger) JobStopCallback(job *schema.JobMeta) { +func (jt *JobTagger) JobStopCallback(job *schema.Job) { + for _, tagger := range jobTagger.stopTaggers { + tagger.Match(job) + } } From 3c66840f953cdba46a23f7c32e71e6186e830489 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 23 May 2025 10:13:59 +0200 Subject: [PATCH 21/30] Add tagger config option and command line switch to run taggers on all jobs --- cmd/cc-backend/cli.go | 6 ++-- cmd/cc-backend/main.go | 11 ++++++- internal/repository/job.go | 1 + internal/repository/jobFind.go | 29 +++++++++++++++++++ internal/tagger/tagger.go | 53 ++++++++++++++++++++++++++-------- pkg/schema/config.go | 2 ++ 6 files changed, 87 insertions(+), 15 deletions(-) diff --git a/cmd/cc-backend/cli.go b/cmd/cc-backend/cli.go index 8d9e7e6..8b826bb 100644 --- a/cmd/cc-backend/cli.go +++ b/cmd/cc-backend/cli.go @@ -7,8 +7,9 @@ package main import "flag" var ( - flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool - flagNewUser, 
flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string + flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, + flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool + flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string ) func cliInit() { @@ -21,6 +22,7 @@ func cliInit() { flag.BoolVar(&flagVersion, "version", false, "Show version information and exit") flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit") flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit") + flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit") flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index cbfccef..cd2d08d 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -213,12 +213,21 @@ func main() { } } + if flagApplyTags { + if err := tagger.RunTaggers(); err != nil { + log.Abortf("Running job taggers.\nError: %s\n", err.Error()) + } + } + if !flagServer { log.Exit("No errors, server flag not set. Exiting cc-backend.") } archiver.Start(repository.GetJobRepository()) - tagger.Init() + + if config.Keys.EnableJobTaggers { + tagger.Init() + } taskManager.Start() serverInit() diff --git a/internal/repository/job.go b/internal/repository/job.go index 73a2588..97ca280 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -472,6 +472,7 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } +// FIXME: Reconsider filtering short jobs with harcoded threshold func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). Where(fmt.Sprintf("job.cluster = '%s'", cluster)). diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index ac09355..614b7c0 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -103,6 +103,35 @@ func (r *JobRepository) FindAll( return jobs, nil } +// Get complete joblist only consisting of db ids. +// This is useful to process large job counts and intended to be used +// together with FindById to process jobs one by one +func (r *JobRepository) GetJobList() ([]int64, error) { + query := sq.Select("id").From("job"). + Where("job.job_state != 'running'") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Error("Error while running query") + return nil, err + } + + jl := make([]int64, 0, 1000) + for rows.Next() { + var id int64 + err := rows.Scan(&id) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jl = append(jl, id) + } + + log.Infof("Return job count %d", len(jl)) + return jl, nil +} + // FindById executes a SQL query to find a specific batch job. // The job is queried using the database id. // It returns a pointer to a schema.Job data structure and an error variable. 
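GetJobList intentionally returns bare database ids so that large job counts can be processed one at a time; the sketch below shows the intended pairing with FindByIdDirect. It is a simplified shape of RunTaggers further down, with processJob as a hypothetical placeholder and error handling abbreviated:

// Sketch only: assumes cc-backend's internal repository package.
func applyToAllJobs(processJob func(*schema.Job)) error {
	r := repository.GetJobRepository()
	ids, err := r.GetJobList()
	if err != nil {
		return err
	}
	for _, id := range ids {
		job, err := r.FindByIdDirect(id)
		if err != nil {
			return err
		}
		processJob(job) // e.g. each registered tagger's Match(job)
	}
	return nil
}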
diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index ffdd011..da32fc4 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" ) @@ -26,30 +27,58 @@ type JobTagger struct { stopTaggers []Tagger } +func newTagger() { + jobTagger = &JobTagger{} + jobTagger.startTaggers = make([]Tagger, 0) + jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + jobTagger.stopTaggers = make([]Tagger, 0) + jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) + + for _, tagger := range jobTagger.startTaggers { + tagger.Register() + } +} + func Init() { initOnce.Do(func() { - jobTagger = &JobTagger{} - jobTagger.startTaggers = make([]Tagger, 0) - jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) - jobTagger.stopTaggers = make([]Tagger, 0) - jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) - - for _, tagger := range jobTagger.startTaggers { - tagger.Register() - } - + newTagger() repository.RegisterJobJook(jobTagger) }) } func (jt *JobTagger) JobStartCallback(job *schema.Job) { - for _, tagger := range jobTagger.startTaggers { + for _, tagger := range jt.startTaggers { tagger.Match(job) } } func (jt *JobTagger) JobStopCallback(job *schema.Job) { - for _, tagger := range jobTagger.stopTaggers { + for _, tagger := range jt.stopTaggers { tagger.Match(job) } } + +func RunTaggers() error { + newTagger() + r := repository.GetJobRepository() + jl, err := r.GetJobList() + if err != nil { + log.Errorf("Error while getting job list %s", err) + return err + } + + for _, id := range jl { + job, err := r.FindByIdDirect(id) + if err != nil { + log.Errorf("Error while getting job %s", err) + return err + } + for _, tagger := range jobTagger.startTaggers { + tagger.Match(job) + } + for _, tagger := range jobTagger.stopTaggers { + tagger.Match(job) + } + } + return nil +} diff --git a/pkg/schema/config.go b/pkg/schema/config.go index a5caa61..eda3d91 100644 --- a/pkg/schema/config.go +++ b/pkg/schema/config.go @@ -131,6 +131,8 @@ type ProgramConfig struct { // do not write to the job-archive. 
DisableArchive bool `json:"disable-archive"` + EnableJobTaggers bool `json:"enable-job-taggers"` + // Validate json input against schema Validate bool `json:"validate"` From f14bdb306845ba8304e8d79418a885ee19983b71 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 13:08:03 +0200 Subject: [PATCH 22/30] Fix bugs in job classifier and tagger infrastructure --- cmd/cc-backend/main.go | 7 +- go.mod | 3 + go.sum | 6 ++ internal/tagger/classifyJob.go | 96 ++++++++++++++++++------ internal/tagger/detectApp.go | 10 ++- internal/tagger/jobclasses/highload.json | 16 ++-- internal/tagger/jobclasses/lowload.json | 8 +- internal/tagger/tagger.go | 6 +- 8 files changed, 105 insertions(+), 47 deletions(-) diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index cd2d08d..ab07d28 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -213,6 +213,10 @@ func main() { } } + if config.Keys.EnableJobTaggers { + tagger.Init() + } + if flagApplyTags { if err := tagger.RunTaggers(); err != nil { log.Abortf("Running job taggers.\nError: %s\n", err.Error()) @@ -225,9 +229,6 @@ func main() { archiver.Start(repository.GetJobRepository()) - if config.Keys.EnableJobTaggers { - tagger.Init() - } taskManager.Start() serverInit() diff --git a/go.mod b/go.mod index 062ee3e..c57d9ed 100644 --- a/go.mod +++ b/go.mod @@ -54,6 +54,8 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/go-viper/mapstructure/v2 v2.2.1 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gookit/color v1.5.4 // indirect + github.com/gookit/goutil v0.6.18 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect @@ -80,6 +82,7 @@ require ( github.com/sosodev/duration v1.3.1 // indirect github.com/swaggo/files v1.0.1 // indirect github.com/urfave/cli/v2 v2.27.5 // indirect + github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.uber.org/atomic v1.11.0 // indirect golang.org/x/mod v0.23.0 // indirect diff --git a/go.sum b/go.sum index b4c3781..2102888 100644 --- a/go.sum +++ b/go.sum @@ -101,6 +101,10 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0= +github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w= +github.com/gookit/goutil v0.6.18 h1:MUVj0G16flubWT8zYVicIuisUiHdgirPAkmnfD2kKgw= +github.com/gookit/goutil v0.6.18/go.mod h1:AY/5sAwKe7Xck+mEbuxj0n/bc3qwrGNe3Oeulln7zBA= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -241,6 +245,8 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I= github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM= +github.com/xo/terminfo 
---
 cmd/cc-backend/main.go                   |  7 +-
 go.mod                                   |  3 +
 go.sum                                   |  6 ++
 internal/tagger/classifyJob.go           | 96 ++++++++++++++++++------
 internal/tagger/detectApp.go             | 10 ++-
 internal/tagger/jobclasses/highload.json | 16 ++--
 internal/tagger/jobclasses/lowload.json  |  8 +-
 internal/tagger/tagger.go                |  6 +-
 8 files changed, 105 insertions(+), 47 deletions(-)

diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go
index cd2d08d..ab07d28 100644
--- a/cmd/cc-backend/main.go
+++ b/cmd/cc-backend/main.go
@@ -213,6 +213,10 @@ func main() {
 		}
 	}
 
+	if config.Keys.EnableJobTaggers {
+		tagger.Init()
+	}
+
 	if flagApplyTags {
 		if err := tagger.RunTaggers(); err != nil {
 			log.Abortf("Running job taggers.\nError: %s\n", err.Error())
@@ -225,9 +229,6 @@ func main() {
 
 	archiver.Start(repository.GetJobRepository())
 
-	if config.Keys.EnableJobTaggers {
-		tagger.Init()
-	}
 	taskManager.Start()
 	serverInit()

diff --git a/go.mod b/go.mod
index 062ee3e..c57d9ed 100644
--- a/go.mod
+++ b/go.mod
@@ -54,6 +54,8 @@ require (
 	github.com/go-openapi/swag v0.23.0 // indirect
 	github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
 	github.com/google/uuid v1.6.0 // indirect
+	github.com/gookit/color v1.5.4 // indirect
+	github.com/gookit/goutil v0.6.18 // indirect
 	github.com/gorilla/securecookie v1.1.2 // indirect
 	github.com/gorilla/websocket v1.5.3 // indirect
 	github.com/hashicorp/errwrap v1.1.0 // indirect
@@ -80,6 +82,7 @@ require (
 	github.com/sosodev/duration v1.3.1 // indirect
 	github.com/swaggo/files v1.0.1 // indirect
 	github.com/urfave/cli/v2 v2.27.5 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
 	go.uber.org/atomic v1.11.0 // indirect
 	golang.org/x/mod v0.23.0 // indirect

diff --git a/go.sum b/go.sum
index b4c3781..2102888 100644
--- a/go.sum
+++ b/go.sum
@@ -101,6 +101,10 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
 github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0=
+github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w=
+github.com/gookit/goutil v0.6.18 h1:MUVj0G16flubWT8zYVicIuisUiHdgirPAkmnfD2kKgw=
+github.com/gookit/goutil v0.6.18/go.mod h1:AY/5sAwKe7Xck+mEbuxj0n/bc3qwrGNe3Oeulln7zBA=
 github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE=
 github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w=
 github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -241,6 +245,8 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=
 github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=
 github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I=
 github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
 github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
 github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=

diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go
index f7195e3..bf86894 100644
--- a/internal/tagger/classifyJob.go
+++ b/internal/tagger/classifyJob.go
@@ -21,6 +21,7 @@ import (
 	"github.com/ClusterCockpit/cc-backend/pkg/schema"
 	"github.com/expr-lang/expr"
 	"github.com/expr-lang/expr/vm"
+	"github.com/gookit/goutil/dump"
 )
 
 //go:embed jobclasses/*
@@ -63,13 +64,7 @@ type JobClassTagger struct {
 	cfgPath string
 }
 
-func (t *JobClassTagger) prepareRule(filename string, fns string) {
-	b, err := os.ReadFile(filename)
-	if err != nil {
-		log.Warnf("prepareRule() > open file error: %v", err)
-		return
-	}
-
+func (t *JobClassTagger) prepareRule(b []byte, fns string) {
 	var rule RuleFormat
 	if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil {
 		log.Warn("Error while decoding raw job meta json")
@@ -93,9 +88,7 @@
 	}
 
 	// set all required metrics
-	for _, m := range rule.Metrics {
-		ri.metrics = append(ri.metrics, m)
-	}
+	ri.metrics = append(ri.metrics, rule.Metrics...)
 
 	// compile requirements
 	for _, r := range rule.Requirements {
@@ -107,6 +100,16 @@
 		ri.requirements = append(ri.requirements, req)
 	}
 
+	// compile variables
+	for _, v := range rule.Variables {
+		req, err := expr.Compile(v.Expr, expr.AsFloat64())
+		if err != nil {
+			log.Errorf("error compiling requirement %s: %#v", v.Name, err)
+			return
+		}
+		ri.variables = append(ri.variables, ruleVariable{name: v.Name, expr: req})
+	}
+
 	// compile rule
 	exp, err := expr.Compile(rule.Rule, expr.AsBool())
 	if err != nil {
@@ -116,7 +119,11 @@
 	ri.rule = exp
 
 	// prepare hint template
-	ri.hint = template.Must(template.New(fns).Parse(rule.Hint))
+	ri.hint, err = template.New(fns).Parse(rule.Hint)
+	if err != nil {
+		log.Errorf("error processing template %s: %#v", fns, err)
+	}
+	log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables))
 
 	delete(t.rules, rule.Tag)
 	t.rules[rule.Tag] = ri
@@ -137,38 +144,59 @@ func (t *JobClassTagger) EventCallback() {
 		fns := fn.Name()
 		log.Debugf("Process: %s", fns)
 		filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
-		t.prepareRule(filename, fns)
+		b, err := os.ReadFile(filename)
+		if err != nil {
+			log.Warnf("prepareRule() > open file error: %v", err)
+			return
+		}
+		t.prepareRule(b, fns)
 	}
 }
 
+func (t *JobClassTagger) initParameters() error {
+	log.Info("Initialize parameters")
+	b, err := jobclassFiles.ReadFile("jobclasses/parameters.json")
+	if err != nil {
+		log.Warnf("prepareRule() > open file error: %v", err)
+		return err
+	}
+
+	if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil {
+		log.Warn("Error while decoding parameters.json")
+		return err
+	}
+
+	return nil
+}
+
 func (t *JobClassTagger) Register() error {
 	t.cfgPath = "./var/tagger/jobclasses"
 	t.tagType = "jobClass"
 
-	files, err := appFiles.ReadDir("jobclasses")
+	err := t.initParameters()
+	if err != nil {
+		log.Warnf("error reading parameters.json: %v", err)
+		return err
+	}
+
+	files, err := jobclassFiles.ReadDir("jobclasses")
 	if err != nil {
 		return fmt.Errorf("error reading app folder: %#v", err)
 	}
 	t.rules = make(map[string]ruleInfo, 0)
 	for _, fn := range files {
 		fns := fn.Name()
-		filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
+		if fns != "parameters.json" {
+			filename := fmt.Sprintf("jobclasses/%s", fns)
+			log.Infof("Process: %s", fns)
 
-		if fn.Name() == "parameters.json" {
-			b, err := os.ReadFile(filename)
+			b, err := jobclassFiles.ReadFile(filename)
 			if err != nil {
 				log.Warnf("prepareRule() > open file error: %v", err)
 				return err
 			}
-
-			if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil {
-				log.Warn("Error while decoding parameters.json")
-				return err
-			}
-			continue
+			t.prepareRule(b, fns)
 		}
-		log.Debugf("Process: %s", fns)
-		t.prepareRule(filename, fns)
 	}
 
 	if util.CheckFileExists(t.cfgPath) {
@@ -183,6 +211,7 @@
 func (t *JobClassTagger) Match(job *schema.Job) {
 	r := repository.GetJobRepository()
 	jobstats, err := archive.GetStatistics(job)
+	log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID)
 	if err != nil {
 		log.Errorf("job classification failed for job %d: %#v", job.JobID, err)
 		return
@@ -191,6 +220,16 @@
 	for tag, ri := range t.rules {
 		env := make(map[string]any)
 		maps.Copy(env, ri.env)
+		log.Infof("Try to match rule %s for job %d", tag, job.JobID)
+		env["job"] = map[string]any{
+			"exclusive": job.Exclusive,
+			"duration":  job.Duration,
+			"numCores":  job.NumHWThreads,
+			"numNodes":  job.NumNodes,
+			"jobState":  job.State,
+			"numAcc":    job.NumAcc,
+			"smt":       job.SMT,
+		}
 
 		// add metrics to env
 		for _, m := range ri.metrics {
@@ -225,21 +264,28 @@
 			env[v.name] = value
 		}
 
-		match, err := expr.Run(ri.rule, job)
+		dump.P(env)
+
+		match, err := expr.Run(ri.rule, env)
 		if err != nil {
 			log.Errorf("error running rule %s: %#v", tag, err)
+			return
 		}
 		if match.(bool) {
+			log.Info("Rule matches!")
 			id := job.ID
 			if !r.HasTag(id, t.tagType, tag) {
 				r.AddTagOrCreateDirect(id, t.tagType, tag)
 			}
+		} else {
+			log.Info("Rule does not match!")
 		}
 
 		// process hint template
 		var msg bytes.Buffer
 		if err := ri.hint.Execute(&msg, env); err != nil {
 			log.Errorf("Template error: %s", err.Error())
+			return
 		}
 
 		// FIXME: Handle case where multiple tags apply

diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go
index a37924e..7945b48 100644
--- a/internal/tagger/detectApp.go
+++ b/internal/tagger/detectApp.go
@@ -79,10 +79,10 @@ func (t *AppTagger) Register() error {
 		fns := fn.Name()
 		log.Debugf("Process: %s", fns)
 		f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns))
-		defer f.Close()
 		if err != nil {
 			return fmt.Errorf("error opening app file %s: %#v", fns, err)
 		}
+		defer f.Close()
 		t.scanApp(f, fns)
 	}
 
@@ -97,7 +97,13 @@ func (t *AppTagger) Register() error {
 
 func (t *AppTagger) Match(job *schema.Job) {
 	r := repository.GetJobRepository()
-	jobscript, ok := job.MetaData["jobScript"]
+	metadata, err := r.FetchMetadata(job)
+	if err != nil {
+		log.Infof("Cannot fetch metadata for job: %d on %s", job.JobID, job.Cluster)
+		return
+	}
+
+	jobscript, ok := metadata["jobScript"]
 	if ok {
 		id := job.ID

diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json
index 29d4026..2715ee8 100644
--- a/internal/tagger/jobclasses/highload.json
+++ b/internal/tagger/jobclasses/highload.json
@@ -7,27 +7,21 @@
     "job_min_duration_seconds",
     "sampling_interval_seconds"
   ],
-  "metrics": [
-    "cpu_load"
-  ],
+  "metrics": ["cpu_load"],
   "requirements": [
     "job.exclusive == 1",
     "job.duration > job_min_duration_seconds"
   ],
-  "terms": [
-    {
-      "name": "",
-      "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')"
-    },
+  "variables": [
     {
       "name": "load_threshold",
-      "expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
+      "expr": "(job.numCores / job.numNodes) * excessivecpuload_threshold_factor"
     },
     {
       "name": "load_perc",
-      "expr": "load_mean / load_threshold"
+      "expr": "cpu_load / load_threshold"
     }
   ],
   "rule": "cpu_load > load_threshold",
-  "hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}."
+  "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load}} exceeds the threshold {{.load_threshold}}."
 }

diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json
index 3c5bd4d..4c21a6b 100644
--- a/internal/tagger/jobclasses/lowload.json
+++ b/internal/tagger/jobclasses/lowload.json
@@ -6,9 +6,7 @@
     "job_min_duration_seconds",
     "sampling_interval_seconds"
   ],
-  "metrics": [
-    "cpu_load"
-  ],
+  "metrics": ["cpu_load"],
   "requirements": [
     "job.exclusive == 1",
     "job.duration > job_min_duration_seconds"
@@ -16,7 +14,7 @@
   "variables": [
     {
       "name": "load_threshold",
-      "expr": "job.numHwthreads * lowcpuload_threshold_factor"
+      "expr": "job.numCores * lowcpuload_threshold_factor"
     },
     {
       "name": "load_perc",
@@ -24,5 +22,5 @@
     }
   ],
   "rule": "cpu_load < load_threshold",
-  "hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}."
+  "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}."
 }

diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go
index da32fc4..04edd49 100644
--- a/internal/tagger/tagger.go
+++ b/internal/tagger/tagger.go
@@ -32,11 +32,14 @@ func newTagger() {
 	jobTagger.startTaggers = make([]Tagger, 0)
 	jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{})
 	jobTagger.stopTaggers = make([]Tagger, 0)
-	jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{})
+	jobTagger.stopTaggers = append(jobTagger.stopTaggers, &JobClassTagger{})
 
 	for _, tagger := range jobTagger.startTaggers {
 		tagger.Register()
 	}
+	for _, tagger := range jobTagger.stopTaggers {
+		tagger.Register()
+	}
 }
 
 func Init() {
@@ -77,6 +80,7 @@ func RunTaggers() error {
 			tagger.Match(job)
 		}
 		for _, tagger := range jobTagger.stopTaggers {
+			log.Infof("Run stop tagger for job %d", job.ID)
 			tagger.Match(job)
 		}
 	}

From 8d6ae85b0d76b2e25198a4e474b608345d33e34a Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 13:26:18 +0200
Subject: [PATCH 23/30] Fix bug with job columns
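
Unqualified column names break as soon as a query touches more than one
table with the same schema, so the job columns are now table-prefixed and
the job_cache table gets its own prefixed list. A sketch of the failure
mode with squirrel (hypothetical query, not taken from the diff below):

    // Both job and jobtag carry an id-like column; without the "job."
    // prefix the database cannot tell which table "id" refers to.
    q := sq.Select(jobColumns...).From("job").
        Join("jobtag ON jobtag.job_id = job.id").
        Where(sq.Eq{"jobtag.tag_id": 42})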
---
 internal/repository/job.go       | 29 +++++++++++++++++------------
 internal/repository/jobCreate.go |  2 +-
 internal/repository/jobFind.go   |  2 +-
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/internal/repository/job.go b/internal/repository/job.go
index 97ca280..c6c566e 100644
--- a/internal/repository/job.go
+++ b/internal/repository/job.go
@@ -51,25 +51,30 @@ func GetJobRepository() *JobRepository {
 	return jobRepoInstance
 }
 
-// var jobColumns []string = []string{
-// 	"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id",
-// 	"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
-// 	"job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy",
-// }
-
 var jobColumns []string = []string{
-	"id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time",
-	"cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc",
-	"exclusive", "monitoring_status", "smt", "job_state", "duration", "walltime",
-	"resources", "footprint", "energy",
+	"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster",
+	"job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes",
+	"job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status",
+	"job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources",
+	"job.footprint", "job.energy",
+}
+
+var jobCacheColumns []string = []string{
+	"jobcache.id", "jobcache.job_id", "jobcache.hpc_user", "jobcache.project", "jobcache.cluster",
+	"jobcache.subcluster", "jobcache.start_time", "jobcache.cluster_partition",
+	"jobcache.array_job_id", "jobcache.num_nodes", "jobcache.num_hwthreads",
+	"jobcache.num_acc", "jobcache.exclusive", "jobcache.monitoring_status", "jobcache.smt",
+	"jobcache.job_state", "jobcache.duration", "jobcache.walltime", "jobcache.resources",
+	"jobcache.footprint", "jobcache.energy",
 }
 
 func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) {
 	job := &schema.Job{}
 	if err := row.Scan(
-		&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
-		&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
+		&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster,
+		&job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads,
+		&job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
 		&job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil {
 		log.Warnf("Error while scanning rows (Job): %v", err)
 		return nil, err

diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go
index a651db9..f286c68 100644
--- a/internal/repository/jobCreate.go
+++ b/internal/repository/jobCreate.go
@@ -50,7 +50,7 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
 	r.Mutex.Lock()
 	defer r.Mutex.Unlock()
 
-	query := sq.Select(jobColumns...).From("job_cache")
+	query := sq.Select(jobCacheColumns...).From("job_cache")
 
 	rows, err := query.RunWith(r.stmtCache).Query()
 	if err != nil {

diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go
index 614b7c0..b820084 100644
--- a/internal/repository/jobFind.go
+++ b/internal/repository/jobFind.go
@@ -48,7 +48,7 @@ func (r *JobRepository) FindCached(
 	cluster *string,
 	startTime *int64,
 ) (*schema.Job, error) {
-	q := sq.Select(jobColumns...).From("job_cache").
+	q := sq.Select(jobCacheColumns...).From("job_cache").
 		Where("job_cache.job_id = ?", *jobId)
 
 	if cluster != nil {

From 0261c263f96e50196ce87db0f01c848d111caf87 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 13:36:23 +0200
Subject: [PATCH 24/30] Add hint message only if rule matches
---
 internal/tagger/classifyJob.go | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go
index bf86894..9c4f7cb 100644
--- a/internal/tagger/classifyJob.go
+++ b/internal/tagger/classifyJob.go
@@ -277,18 +277,18 @@ func (t *JobClassTagger) Match(job *schema.Job) {
 			if !r.HasTag(id, t.tagType, tag) {
 				r.AddTagOrCreateDirect(id, t.tagType, tag)
 			}
+
+			// process hint template
+			var msg bytes.Buffer
+			if err := ri.hint.Execute(&msg, env); err != nil {
+				log.Errorf("Template error: %s", err.Error())
+				return
+			}
+
+			// FIXME: Handle case where multiple tags apply
+			r.UpdateMetadata(job, "message", msg.String())
 		} else {
 			log.Info("Rule does not match!")
 		}
-
-		// process hint template
-		var msg bytes.Buffer
-		if err := ri.hint.Execute(&msg, env); err != nil {
-			log.Errorf("Template error: %s", err.Error())
-			return
-		}
-
-		// FIXME: Handle case where multiple tags apply
-		r.UpdateMetadata(job, "message", msg.String())
 	}
 }

From 1e7fbe5d561263b735e13b9ee43d35388e9abad1 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 13:40:34 +0200
Subject: [PATCH 25/30] Refactor
---
 internal/repository/jobQuery.go | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go
index 6a2ddec..2f72e77 100644
--- a/internal/repository/jobQuery.go
+++ b/internal/repository/jobQuery.go
@@ -148,9 +148,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
 	}
 	if filter.DbID != nil {
 		dbIDs := make([]string, len(filter.DbID))
-		for i, val := range filter.DbID {
-			dbIDs[i] = val
-		}
+		copy(dbIDs, filter.DbID)
 		query = query.Where(sq.Eq{"job.id": dbIDs})
 	}
 	if filter.JobID != nil {

From 9b325041c14040c5c79d3e5fe6ec0985c4bb40ba Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 14:30:30 +0200
Subject: [PATCH 26/30] Fix typo in jobCache columns
---
 internal/repository/job.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/internal/repository/job.go b/internal/repository/job.go
index c6c566e..3702099 100644
--- a/internal/repository/job.go
+++ b/internal/repository/job.go
@@ -60,12 +60,12 @@ var jobColumns []string = []string{
 }
 
 var jobCacheColumns []string = []string{
-	"jobcache.id", "jobcache.job_id", "jobcache.hpc_user", "jobcache.project", "jobcache.cluster",
-	"jobcache.subcluster", "jobcache.start_time", "jobcache.cluster_partition",
-	"jobcache.array_job_id", "jobcache.num_nodes", "jobcache.num_hwthreads",
-	"jobcache.num_acc", "jobcache.exclusive", "jobcache.monitoring_status", "jobcache.smt",
-	"jobcache.job_state", "jobcache.duration", "jobcache.walltime", "jobcache.resources",
-	"jobcache.footprint", "jobcache.energy",
+	"job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster",
+	"job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition",
+	"job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads",
+	"job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt",
+	"job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources",
+	"job_cache.footprint", "job_cache.energy",
 }
 
 func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) {

From 80032170923162edffa9a5e86609a7de38c384cd Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 14:41:02 +0200
Subject: [PATCH 27/30] Add string to gromacs app file
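
The app tagger flags a job when any entry of an app file occurs in the
job script, so listing the bare binary name also catches jobs that invoke
mdrun directly instead of a gmx wrapper. A rough sketch of the check
(simplified and assuming plain substring matching; see scanApp() and
Match() in detectApp.go for the real loop):

    // substrings comes from apps/gromacs.txt; jobscript from job metadata.
    for _, s := range []string{"GROMACS", "gromacs", "GMX", "mdrun"} {
        if strings.Contains(jobscript, s) {
            // tag the job with app tag "gromacs"
            break
        }
    }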
---
 internal/tagger/apps/gromacs.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt
index d8c0829..c5d939b 100644
--- a/internal/tagger/apps/gromacs.txt
+++ b/internal/tagger/apps/gromacs.txt
@@ -1,3 +1,4 @@
 GROMACS
 gromacs
 GMX
+mdrun

From 5a88c77171a7c41a46c3b12577161462139cab64 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 26 May 2025 14:42:41 +0200
Subject: [PATCH 28/30] Remove debug output
---
 go.mod                         | 9 +++------
 go.sum                         | 6 ------
 internal/tagger/classifyJob.go | 3 +--
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/go.mod b/go.mod
index c57d9ed..6c92171 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,8 @@ require (
 	github.com/ClusterCockpit/cc-units v0.4.0
 	github.com/Masterminds/squirrel v1.5.4
 	github.com/coreos/go-oidc/v3 v3.12.0
+	github.com/expr-lang/expr v1.17.3
+	github.com/fsnotify/fsnotify v1.9.0
 	github.com/go-co-op/gocron/v2 v2.16.0
 	github.com/go-ldap/ldap/v3 v3.4.10
 	github.com/go-sql-driver/mysql v1.9.0
@@ -20,6 +22,7 @@ require (
 	github.com/gorilla/sessions v1.4.0
 	github.com/influxdata/influxdb-client-go/v2 v2.14.0
 	github.com/jmoiron/sqlx v1.4.0
+	github.com/joho/godotenv v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.24
 	github.com/prometheus/client_golang v1.21.0
 	github.com/prometheus/common v0.62.0
@@ -43,9 +46,7 @@ require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
-	github.com/expr-lang/expr v1.17.3 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect
 	github.com/go-jose/go-jose/v4 v4.0.5 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
@@ -54,15 +55,12 @@ require (
 	github.com/go-openapi/swag v0.23.0 // indirect
 	github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
 	github.com/google/uuid v1.6.0 // indirect
-	github.com/gookit/color v1.5.4 // indirect
-	github.com/gookit/goutil v0.6.18 // indirect
 	github.com/gorilla/securecookie v1.1.2 // indirect
 	github.com/gorilla/websocket v1.5.3 // indirect
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/hashicorp/go-multierror v1.1.1 // indirect
 	github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
 	github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
-	github.com/joho/godotenv v1.5.1 // indirect
 	github.com/jonboulle/clockwork v0.5.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/jpillora/backoff v1.0.0 // indirect
@@ -82,7 +80,6 @@ require (
 	github.com/sosodev/duration v1.3.1 // indirect
 	github.com/swaggo/files v1.0.1 // indirect
 	github.com/urfave/cli/v2 v2.27.5 // indirect
-	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
 	go.uber.org/atomic v1.11.0 // indirect
 	golang.org/x/mod v0.23.0 // indirect

diff --git a/go.sum b/go.sum
index 2102888..b4c3781 100644
--- a/go.sum
+++ b/go.sum
@@ -101,10 +101,6 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
 github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0=
-github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w=
-github.com/gookit/goutil v0.6.18 h1:MUVj0G16flubWT8zYVicIuisUiHdgirPAkmnfD2kKgw=
-github.com/gookit/goutil v0.6.18/go.mod h1:AY/5sAwKe7Xck+mEbuxj0n/bc3qwrGNe3Oeulln7zBA=
 github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE=
 github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w=
 github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -245,8 +241,6 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=
 github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=
 github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I=
 github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
 github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
 github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=

diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go
index 9c4f7cb..16afe63 100644
--- a/internal/tagger/classifyJob.go
+++ b/internal/tagger/classifyJob.go
@@ -21,7 +21,6 @@ import (
 	"github.com/ClusterCockpit/cc-backend/pkg/schema"
 	"github.com/expr-lang/expr"
 	"github.com/expr-lang/expr/vm"
-	"github.com/gookit/goutil/dump"
 )
 
 //go:embed jobclasses/*
@@ -264,7 +263,7 @@ func (t *JobClassTagger) Match(job *schema.Job) {
 			env[v.name] = value
 		}
 
-		dump.P(env)
+		// dump.P(env)
 
 		match, err := expr.Run(ri.rule, env)
 		if err != nil {

From 0aecea6de21932d99a2d7632aa0b01a1525472a4 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Tue, 27 May 2025 09:23:28 +0200
Subject: [PATCH 29/30] Refactor. Add helper routine to get a subcluster's
 metric list.
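
The new helper resolves the effective metric thresholds for one
subcluster, preferring a subcluster override and falling back to the
cluster-wide metric config. A minimal usage sketch (illustrative log
output, not taken from the diff below):

    // Map from metric name to its thresholds for this job's subcluster.
    metrics := archive.GetMetricConfigSubCluster(job.Cluster, job.SubCluster)
    if m, ok := metrics["cpu_load"]; ok {
        log.Infof("cpu_load peak=%.1f normal=%.1f caution=%.1f alert=%.1f",
            m.Peak, m.Normal, m.Caution, m.Alert)
    }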
---
 internal/tagger/classifyJob.go | 34 +++++++++++++++++-----
 pkg/archive/clusterConfig.go   | 53 ++++++++++++++++++++++++++++----
 pkg/archive/nodelist.go        |  9 +++---
 pkg/schema/cluster.go          | 32 ++++++++++----------
 4 files changed, 94 insertions(+), 34 deletions(-)

diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go
index 16afe63..0af6738 100644
--- a/internal/tagger/classifyJob.go
+++ b/internal/tagger/classifyJob.go
@@ -124,7 +124,6 @@ func (t *JobClassTagger) prepareRule(b []byte, fns string) {
 	}
 
 	log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables))
-	delete(t.rules, rule.Tag)
 	t.rules[rule.Tag] = ri
 }
 
@@ -139,16 +138,33 @@ func (t *JobClassTagger) EventCallback() {
 		log.Fatal(err)
 	}
 
-	for _, fn := range files {
-		fns := fn.Name()
-		log.Debugf("Process: %s", fns)
-		filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
-		b, err := os.ReadFile(filename)
+	if util.CheckFileExists(t.cfgPath + "/parameters.json") {
+		log.Info("Merge parameters")
+		b, err := os.ReadFile(t.cfgPath + "/parameters.json")
 		if err != nil {
 			log.Warnf("prepareRule() > open file error: %v", err)
-			return
 		}
-		t.prepareRule(b, fns)
+
+		var paramTmp map[string]any
+		if err := json.NewDecoder(bytes.NewReader(b)).Decode(&paramTmp); err != nil {
+			log.Warn("Error while decoding parameters.json")
+		}
+
+		maps.Copy(t.parameters, paramTmp)
+	}
+
+	for _, fn := range files {
+		fns := fn.Name()
+		if fns != "parameters.json" {
+			log.Debugf("Process: %s", fns)
+			filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
+			b, err := os.ReadFile(filename)
+			if err != nil {
+				log.Warnf("prepareRule() > open file error: %v", err)
+				return
+			}
+			t.prepareRule(b, fns)
+		}
 	}
 }
 
@@ -220,6 +236,8 @@ func (t *JobClassTagger) Match(job *schema.Job) {
 		env := make(map[string]any)
 		maps.Copy(env, ri.env)
 		log.Infof("Try to match rule %s for job %d", tag, job.JobID)
+
+		// Initialize environment
 		env["job"] = map[string]any{
 			"exclusive": job.Exclusive,
 			"duration":  job.Duration,

diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go
index d53941b..95520a0 100644
--- a/pkg/archive/clusterConfig.go
+++ b/pkg/archive/clusterConfig.go
@@ -69,16 +69,18 @@ func initClusterConfig() error {
 
 			for _, sc := range cluster.SubClusters {
 				newMetric := &schema.MetricConfig{
-					Unit:          mc.Unit,
+					Metric: schema.Metric{
+						Name:    mc.Name,
+						Unit:    mc.Unit,
+						Peak:    mc.Peak,
+						Normal:  mc.Normal,
+						Caution: mc.Caution,
+						Alert:   mc.Alert,
+					},
 					Energy:        mc.Energy,
-					Name:          mc.Name,
 					Scope:         mc.Scope,
 					Aggregation:   mc.Aggregation,
-					Peak:          mc.Peak,
-					Caution:       mc.Caution,
-					Alert:         mc.Alert,
 					Timestep:      mc.Timestep,
-					Normal:        mc.Normal,
 					LowerIsBetter: mc.LowerIsBetter,
 				}
 
@@ -167,6 +169,45 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
 	return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
 }
 
+func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
+	metrics := make(map[string]*schema.Metric)
+
+	for _, c := range Clusters {
+		if c.Name == cluster {
+			for _, m := range c.MetricConfig {
+				for _, s := range m.SubClusters {
+					if s.Name == subcluster {
+						metrics[m.Name] = &schema.Metric{
+							Name:    m.Name,
+							Unit:    s.Unit,
+							Peak:    s.Peak,
+							Normal:  s.Normal,
+							Caution: s.Caution,
+							Alert:   s.Alert,
+						}
+						break
+					}
+				}
+
+				_, ok := metrics[m.Name]
+				if !ok {
+					metrics[m.Name] = &schema.Metric{
+						Name:    m.Name,
+						Unit:    m.Unit,
+						Peak:    m.Peak,
+						Normal:  m.Normal,
+						Caution: m.Caution,
+						Alert:   m.Alert,
+					}
+				}
+			}
+			break
+		}
+	}
+
+	return metrics
+}
+
 func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
 	for _, c := range Clusters {
 		if c.Name == cluster {

diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go
index 7700185..26a15d2 100644
--- a/pkg/archive/nodelist.go
+++ b/pkg/archive/nodelist.go
@@ -61,7 +61,7 @@ func (nl *NodeList) PrintList() []string {
 }
 
 func (nl *NodeList) NodeCount() int {
-	var out int = 0
+	out := 0
 	for _, term := range *nl {
 		if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
 			out += 1
@@ -160,7 +160,7 @@ func (nle NLExprIntRange) limits() []map[string]int {
 	m["start"] = int(nle.start)
 	m["end"] = int(nle.end)
 	m["digits"] = int(nle.digits)
-	if nle.zeroPadded == true {
+	if nle.zeroPadded {
 		m["zeroPadded"] = 1
 	} else {
 		m["zeroPadded"] = 0
@@ -183,14 +183,15 @@ func ParseNodeList(raw string) (NodeList, error) {
 	rawterms := []string{}
 	prevterm := 0
 	for i := 0; i < len(raw); i++ {
-		if raw[i] == '[' {
+		switch raw[i] {
+		case '[':
 			for i < len(raw) && raw[i] != ']' {
 				i++
 			}
 			if i == len(raw) {
 				return nil, fmt.Errorf("ARCHIVE/NODELIST > unclosed '['")
 			}
-		} else if raw[i] == ',' {
+		case ',':
 			rawterms = append(rawterms, raw[prevterm:i])
 			prevterm = i + 1
 		}

diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go
index 322f308..1b9f2cc 100644
--- a/pkg/schema/cluster.go
+++ b/pkg/schema/cluster.go
@@ -45,31 +45,31 @@ type SubCluster struct {
 	ThreadsPerCore int `json:"threadsPerCore"`
 }
 
+type Metric struct {
+	Name    string  `json:"name"`
+	Unit    Unit    `json:"unit"`
+	Peak    float64 `json:"peak"`
+	Normal  float64 `json:"normal"`
+	Caution float64 `json:"caution"`
+	Alert   float64 `json:"alert"`
+}
+
 type SubClusterConfig struct {
-	Name          string  `json:"name"`
-	Footprint     string  `json:"footprint,omitempty"`
-	Energy        string  `json:"energy"`
-	Peak          float64 `json:"peak"`
-	Normal        float64 `json:"normal"`
-	Caution       float64 `json:"caution"`
-	Alert         float64 `json:"alert"`
-	Remove        bool    `json:"remove"`
-	LowerIsBetter bool    `json:"lowerIsBetter"`
+	Metric
+	Footprint     string `json:"footprint,omitempty"`
+	Energy        string `json:"energy"`
+	Remove        bool   `json:"remove"`
+	LowerIsBetter bool   `json:"lowerIsBetter"`
 }
 
 type MetricConfig struct {
-	Unit          Unit                `json:"unit"`
+	Metric
 	Energy        string              `json:"energy"`
-	Name          string              `json:"name"`
 	Scope         MetricScope         `json:"scope"`
 	Aggregation   string              `json:"aggregation"`
 	Footprint     string              `json:"footprint,omitempty"`
 	SubClusters   []*SubClusterConfig `json:"subClusters,omitempty"`
-	Peak          float64             `json:"peak"`
-	Caution       float64             `json:"caution"`
-	Alert         float64             `json:"alert"`
 	Timestep      int                 `json:"timestep"`
-	Normal        float64             `json:"normal"`
 	LowerIsBetter bool                `json:"lowerIsBetter"`
 }
 
@@ -127,7 +127,7 @@ func (topo *Topology) GetSocketsFromHWThreads(
 // those in the argument list are assigned to one of the sockets in the first
 // return value, return true as the second value. TODO: Optimize this, there
 // must be a more efficient way/algorithm.
-func (topo *Topology) GetSocketsFromCores (
+func (topo *Topology) GetSocketsFromCores(
 	cores []int,
 ) (sockets []int, exclusive bool) {
 	socketsMap := map[int]int{}

From cdfe7224576a7db0b798327b5f145a8169ae2eb2 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Tue, 27 May 2025 13:02:13 +0200
Subject: [PATCH 30/30] Include metric thresholds in rule environment

Not yet tested
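
With this change each metric entry in the rule environment is a nested
map instead of a bare average, so rules and variables address the
statistic explicitly (e.g. cpu_load.avg) and can compare it against the
subcluster thresholds. A sketch of the environment a rule now evaluates
against (values are illustrative):

    env := map[string]any{
        "job": map[string]any{
            "numCores": 72, "numNodes": 1, "duration": 7200, "exclusive": 1,
        },
        "cpu_load": map[string]any{
            "min": 60.2, "max": 71.8, "avg": 68.5,
            "limits": map[string]float64{
                "peak": 72, "normal": 64, "caution": 36, "alert": 8,
            },
        },
    }
    // e.g. rule: "cpu_load.avg > load_threshold"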
-func (topo *Topology) GetSocketsFromCores ( +func (topo *Topology) GetSocketsFromCores( cores []int, ) (sockets []int, exclusive bool) { socketsMap := map[int]int{} From cdfe7224576a7db0b798327b5f145a8169ae2eb2 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 27 May 2025 13:02:13 +0200 Subject: [PATCH 30/30] Include metric thresholds in rule environment Not yet tested --- internal/tagger/classifyJob.go | 13 ++++++++++++- internal/tagger/jobclasses/highload.json | 2 +- internal/tagger/jobclasses/lowload.json | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index 0af6738..6fd3fae 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -226,6 +226,7 @@ func (t *JobClassTagger) Register() error { func (t *JobClassTagger) Match(job *schema.Job) { r := repository.GetJobRepository() jobstats, err := archive.GetStatistics(job) + metricsList := archive.GetMetricConfigSubCluster(job.Cluster, job.SubCluster) log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) if err != nil { log.Errorf("job classification failed for job %d: %#v", job.JobID, err) @@ -255,7 +256,17 @@ func (t *JobClassTagger) Match(job *schema.Job) { log.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m) return } - env[m] = stats.Avg + env[m] = map[string]any{ + "min": stats.Min, + "max": stats.Max, + "avg": stats.Avg, + "limits": map[string]float64{ + "peak": metricsList[m].Peak, + "normal": metricsList[m].Normal, + "caution": metricsList[m].Caution, + "alert": metricsList[m].Alert, + }, + } } // check rule requirements apply diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 2715ee8..444ca4d 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -19,7 +19,7 @@ }, { "name": "load_perc", - "expr": "cpu_load / load_threshold" + "expr": "cpu_load.avg / load_threshold" } ], "rule": "cpu_load > load_threshold", diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 4c21a6b..1d7e041 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -21,6 +21,6 @@ "expr": "1.0 - (cpu_load / load_threshold)" } ], - "rule": "cpu_load < load_threshold", + "rule": "cpu_load.avg < load_threshold", "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}." }