From d839c536423ee651f52fa03f76695120ec159296 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 22 Aug 2023 10:56:32 +0200 Subject: [PATCH 01/45] Add initial structure --- internal/tagger/apps/gromacs.txt | 0 internal/tagger/apps/openfoam.txt | 0 internal/tagger/apps/vasp.txt | 0 internal/tagger/detectApp.go | 20 ++++++++++++++++++++ internal/tagger/tagger.go | 17 +++++++++++++++++ 5 files changed, 37 insertions(+) create mode 100644 internal/tagger/apps/gromacs.txt create mode 100644 internal/tagger/apps/openfoam.txt create mode 100644 internal/tagger/apps/vasp.txt create mode 100644 internal/tagger/detectApp.go create mode 100644 internal/tagger/tagger.go diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/apps/openfoam.txt b/internal/tagger/apps/openfoam.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt new file mode 100644 index 0000000..e69de29 diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go new file mode 100644 index 0000000..298151b --- /dev/null +++ b/internal/tagger/detectApp.go @@ -0,0 +1,20 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +const tagType = "app" + +type appInfo struct { + tag string + strings []string +} +type AppTagger struct { + apps []appInfo +} + +func (t *AppTagger) Register() error { + + return nil +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go new file mode 100644 index 0000000..52a369b --- /dev/null +++ b/internal/tagger/tagger.go @@ -0,0 +1,17 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package tagger + +import "github.com/ClusterCockpit/cc-backend/pkg/schema" + +type Tagger interface { + Register() error + Match(job *schema.Job) +} + +func Init() error { + + return nil +} From dc0d9fe038a0a940d1b00ceb24f259721e3a293c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 27 Sep 2023 15:01:08 +0200 Subject: [PATCH 02/45] Add more tags to test db --- internal/repository/testdata/job.db | Bin 114688 -> 114688 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 4b00aa55f041b70f717177bc7baef9eb69d1a226..4685f7f85704574eab5cbd27a08f41eb71bf719d 100644 GIT binary patch delta 240 zcmZo@U~gz(-@xL)!Nk9ff&V7|w#|Y9tN4`_nOGUbMU4v*OEU5^Q*u&`42+C*4J>ty z%oPlctV}Je49zFs^w$RJy3fFWpZ^_D*JXZwJ{ECNMz|iP$?xLDxS07jGVnj;Kh3`p zsAe&LtOPR)Bd0M7vVq1{1_oA!h7wF58CDD#15-&x7Dh&6RwiUApern`Of0z=7#NuO oUo-H(=KsY16KKUjt@f&b=aL4ou9lW+QK1NrY5_}>BfPxvRlix=Z!5MW^7{{n>X y!D2G3oD2*MJe+Jml8uvv3rK-geC7v2po&NQ0&I-TKtWzcHYSLgriNt=j1B+{bsGKv From ba7cc9168e74a925c06ac108355b221d5fdf0e88 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 28 Sep 2023 10:20:20 +0200 Subject: [PATCH 03/45] feat: add automatic application detection and tagging --- internal/repository/job_test.go | 30 +++++++++------ internal/repository/tags.go | 13 +++++++ internal/tagger/apps/gromacs.txt | 3 ++ internal/tagger/apps/openfoam.txt | 1 + internal/tagger/apps/vasp.txt | 2 + internal/tagger/detectApp.go | 64 +++++++++++++++++++++++++++++++ internal/tagger/detectApp_test.go | 59 ++++++++++++++++++++++++++++ 7 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 internal/tagger/detectApp_test.go diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index c3f76a7..986365c 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -16,9 +16,7 @@ func TestFind(t *testing.T) { jobId, cluster, startTime := int64(398998), "fritz", int64(1675957496) job, err := r.Find(&jobId, &cluster, &startTime) - if err != nil { - t.Fatal(err) - } + noErr(t, err) // fmt.Printf("%+v", job) @@ -31,9 +29,7 @@ func TestFindById(t *testing.T) { r := setup(t) job, err := r.FindById(5) - if err != nil { - t.Fatal(err) - } + noErr(t, err) // fmt.Printf("%+v", job) @@ -46,14 +42,26 @@ func TestGetTags(t *testing.T) { r := setup(t) tags, counts, err := r.CountTags(nil) - if err != nil { - t.Fatal(err) - } + noErr(t, err) fmt.Printf("TAGS %+v \n", tags) // fmt.Printf("COUNTS %+v \n", counts) - if counts["bandwidth"] != 0 { - t.Errorf("wrong tag count \ngot: %d \nwant: 0", counts["bandwidth"]) + if counts["bandwidth"] != 2 { + t.Errorf("wrong tag count \ngot: %d \nwant: 2", counts["bandwidth"]) + } +} + +func TestHasTag(t *testing.T) { + r := setup(t) + + if !r.HasTag(5, "util", "bandwidth") { + t.Errorf("Expected has tag") + } + if r.HasTag(4, "patho", "idle") { + t.Errorf("Expected has not tag") + } + if !r.HasTag(5, "patho", "idle") { + t.Errorf("Expected has tag") } } diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 52bc836..a6a41b6 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -134,6 +134,19 @@ func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName stri return tagId, nil } +func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { + var id int64 + q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). + Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). 
+ Where("tag.tag_name = ?", tagName) + err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) + if err != nil { + return false + } else { + return true + } +} + // TagId returns the database id of the tag with the specified type and name. func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) { exists = true diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt index e69de29..d8c0829 100644 --- a/internal/tagger/apps/gromacs.txt +++ b/internal/tagger/apps/gromacs.txt @@ -0,0 +1,3 @@ +GROMACS +gromacs +GMX diff --git a/internal/tagger/apps/openfoam.txt b/internal/tagger/apps/openfoam.txt index e69de29..542d645 100644 --- a/internal/tagger/apps/openfoam.txt +++ b/internal/tagger/apps/openfoam.txt @@ -0,0 +1 @@ +openfoam diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt index e69de29..eec9092 100644 --- a/internal/tagger/apps/vasp.txt +++ b/internal/tagger/apps/vasp.txt @@ -0,0 +1,2 @@ +VASP +vasp diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 298151b..714fd27 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -4,17 +4,81 @@ // license that can be found in the LICENSE file. package tagger +import ( + "bufio" + "embed" + "fmt" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + const tagType = "app" +//go:embed apps/* +var appFiles embed.FS + type appInfo struct { tag string strings []string } + type AppTagger struct { apps []appInfo } func (t *AppTagger) Register() error { + files, err := appFiles.ReadDir("apps") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.apps = make([]appInfo, 0) + + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + scanner := bufio.NewScanner(f) + ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + + for scanner.Scan() { + ai.strings = append(ai.strings, scanner.Text()) + } + t.apps = append(t.apps, ai) + } return nil } + +func (t *AppTagger) Match(job *schema.Job) { + r := repository.GetJobRepository() + meta, err := r.FetchMetadata(job) + if err != nil { + log.Error("cannot fetch meta data") + } + jobscript, ok := meta["jobScript"] + if ok { + id := job.ID + + out: + for _, a := range t.apps { + tag := a.tag + for _, s := range a.strings { + if strings.Contains(jobscript, s) { + if !r.HasTag(id, tagType, tag) { + r.AddTagOrCreate(id, tagType, tag) + break out + } + } + } + } + } else { + log.Infof("Cannot extract job script for job: %d on %s", job.JobID, job.Cluster) + } +} diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go new file mode 100644 index 0000000..54a8dfd --- /dev/null +++ b/internal/tagger/detectApp_test.go @@ -0,0 +1,59 @@ +// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" +) + +func setup(tb testing.TB) *repository.JobRepository { + tb.Helper() + log.Init("warn", true) + dbfile := "../repository/testdata/job.db" + err := repository.MigrateDB("sqlite3", dbfile) + noErr(tb, err) + repository.Connect("sqlite3", dbfile) + return repository.GetJobRepository() +} + +func noErr(tb testing.TB, err error) { + tb.Helper() + + if err != nil { + tb.Fatal("Error is not nil:", err) + } +} + +func TestRegister(t *testing.T) { + var tagger AppTagger + + err := tagger.Register() + noErr(t, err) + + if len(tagger.apps) != 3 { + t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 3", len(tagger.apps)) + } +} + +func TestMatch(t *testing.T) { + r := setup(t) + + job, err := r.FindById(5) + noErr(t, err) + + var tagger AppTagger + + err = tagger.Register() + noErr(t, err) + + tagger.Match(job) + + if !r.HasTag(5, "app", "vasp") { + t.Errorf("missing tag vasp") + } +} From 2502989ca2c33d654f923687db8c53c5b44c8b5b Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 28 Sep 2023 10:20:35 +0200 Subject: [PATCH 04/45] Refactor --- pkg/schema/validate.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/schema/validate.go b/pkg/schema/validate.go index aec234c..77b6dbb 100644 --- a/pkg/schema/validate.go +++ b/pkg/schema/validate.go @@ -28,12 +28,13 @@ const ( //go:embed schemas/* var schemaFiles embed.FS -func Validate(k Kind, r io.Reader) (err error) { +func Validate(k Kind, r io.Reader) error { jsonschema.Loaders["embedfs"] = func(s string) (io.ReadCloser, error) { f := filepath.Join("schemas", strings.Split(s, "//")[1]) return schemaFiles.Open(f) } var s *jsonschema.Schema + var err error switch k { case Meta: @@ -54,7 +55,7 @@ func Validate(k Kind, r io.Reader) (err error) { } var v interface{} - if err := json.NewDecoder(r).Decode(&v); err != nil { + if err = json.NewDecoder(r).Decode(&v); err != nil { log.Warnf("Error while decoding raw json schema: %#v", err) return err } From efbe53b6b45a79fe432814189a5e99b2a9dbbaa4 Mon Sep 17 00:00:00 2001 From: AmritanshuV <88365075+AmritanshuV@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:40:57 +0200 Subject: [PATCH 05/45] Rules --- internal/tagger/rules.json | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 internal/tagger/rules.json diff --git a/internal/tagger/rules.json b/internal/tagger/rules.json new file mode 100644 index 0000000..c88afb4 --- /dev/null +++ b/internal/tagger/rules.json @@ -0,0 +1,21 @@ +{ + "and": [ + { + "in": [ + "a40", + { + "var": "metaData.jobScript" + } + ] + }, + { + ">": [ + { + "var": "statistics.clock.min" + }, + 2000 + ] + } + ] + } + \ No newline at end of file From 7abdd0545e5a2ad7e1906411f38c1397185e8ef9 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 07:24:24 +0200 Subject: [PATCH 06/45] Add api for tag handling within cc-backend --- internal/graph/schema.resolvers.go | 37 +++++++------- internal/repository/tags.go | 78 +++++++++++++++++++++++++++++- internal/tagger/detectApp.go | 6 +-- internal/tagger/detectApp_test.go | 2 +- 4 files changed, 98 insertions(+), 25 deletions(-) diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index f3fc389..7e52b3d 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -143,7 +143,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, 
name s return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil } else { log.Warnf("Not authorized to create tag with scope: %s", scope) - return nil, fmt.Errorf("Not authorized to create tag with scope: %s", scope) + return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope) } } @@ -179,7 +179,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -193,7 +193,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds } } else { log.Warnf("Not authorized to add tag: %d", tid) - return nil, fmt.Errorf("Not authorized to add tag: %d", tid) + return nil, fmt.Errorf("not authorized to add tag: %d", tid) } } @@ -226,7 +226,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -240,7 +240,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } @@ -269,7 +269,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Everyone && Private Tag @@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } return tags, nil @@ -499,10 +499,7 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag return nil, err } - hasNextPage := false - if len(nextJobs) == 1 { - hasNextPage = true - } + hasNextPage := len(nextJobs) == 1 return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil } @@ -513,8 +510,8 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF var stats []*model.JobsStatistics // Top Level Defaults - var defaultDurationBins string = "1h" - var defaultMetricBins int = 10 + defaultDurationBins := "1h" + defaultMetricBins := 10 if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") { @@ -779,9 +776,11 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } // SubCluster returns generated.SubClusterResolver implementation. 
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} } -type clusterResolver struct{ *Resolver } -type jobResolver struct{ *Resolver } -type metricValueResolver struct{ *Resolver } -type mutationResolver struct{ *Resolver } -type queryResolver struct{ *Resolver } -type subClusterResolver struct{ *Resolver } +type ( + clusterResolver struct{ *Resolver } + jobResolver struct{ *Resolver } + metricValueResolver struct{ *Resolver } + mutationResolver struct{ *Resolver } + queryResolver struct{ *Resolver } + subClusterResolver struct{ *Resolver } +) diff --git a/internal/repository/tags.go b/internal/repository/tags.go index d07c4d2..a9416c4 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -45,6 +45,36 @@ func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*sche return tags, archive.UpdateTags(j, archiveTags) } +func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error) { + j, err := r.FindByIdDirect(job) + if err != nil { + log.Warn("Error while finding job by id") + return nil, err + } + + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag) + + if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error adding tag with %s: %v", s, err) + return nil, err + } + + tags, err := r.GetTagsDirect(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + archiveTags, err := r.getArchiveTags(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + return tags, archive.UpdateTags(j, archiveTags) +} + // Removes a tag from a job by tag id func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIdWithUser(user, job) @@ -82,7 +112,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return nil, fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Get Job @@ -122,7 +152,7 @@ func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagSc tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Handle Delete JobTagTable @@ -291,6 +321,24 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return tagId, nil } +func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { + tagScope := "global" + + tagId, exists := r.TagId(tagType, tagName, tagScope) + if !exists { + tagId, err = r.CreateTag(tagType, tagName, tagScope) + if err != nil { + return 0, err + } + } + + if _, err := r.AddTagDirect(jobId, tagId); err != nil { + return 0, err + } + + return tagId, nil +} + func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { var id int64 q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). 
@@ -359,6 +407,32 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e return tags, nil } +func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { + q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") + if job != nil { + q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job) + } + + rows, err := q.RunWith(r.stmtCache).Query() + if err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error get tags with %s: %v", s, err) + return nil, err + } + + tags := make([]*schema.Tag, 0) + for rows.Next() { + tag := &schema.Tag{} + if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + tags = append(tags, tag) + } + + return tags, nil +} + // GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 714fd27..339e398 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -1,5 +1,5 @@ -// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. package tagger @@ -72,7 +72,7 @@ func (t *AppTagger) Match(job *schema.Job) { for _, s := range a.strings { if strings.Contains(jobscript, s) { if !r.HasTag(id, tagType, tag) { - r.AddTagOrCreate(id, tagType, tag) + r.AddTagOrCreateDirect(id, tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 54a8dfd..8978e35 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -43,7 +43,7 @@ func TestRegister(t *testing.T) { func TestMatch(t *testing.T) { r := setup(t) - job, err := r.FindById(5) + job, err := r.FindByIdDirect(5) noErr(t, err) var tagger AppTagger From fe1ff5c7a3da53d0fe161814d34638a1ea35a16c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 07:33:33 +0200 Subject: [PATCH 07/45] Update tests from dev --- internal/repository/job_test.go | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 70d8053..363bb6c 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -18,7 +18,9 @@ func TestFind(t *testing.T) { jobId, cluster, startTime := int64(398998), "fritz", int64(1675957496) job, err := r.Find(&jobId, &cluster, &startTime) - noErr(t, err) + if err != nil { + t.Fatal(err) + } // fmt.Printf("%+v", job) @@ -65,21 +67,7 @@ func TestGetTags(t *testing.T) { fmt.Printf("TAGS %+v \n", tags) // fmt.Printf("COUNTS %+v \n", counts) - if counts["bandwidth"] != 2 { - t.Errorf("wrong tag count \ngot: %d \nwant: 2", counts["bandwidth"]) - } -} - -func TestHasTag(t *testing.T) { - r := setup(t) - - if !r.HasTag(5, "util", "bandwidth") { - t.Errorf("Expected has tag") - } - if r.HasTag(4, "patho", "idle") { - t.Errorf("Expected has not tag") - } - if !r.HasTag(5, "patho", "idle") { - t.Errorf("Expected has tag") + if counts["bandwidth"] != 0 { + t.Errorf("wrong tag count \ngot: 
%d \nwant: 0", counts["bandwidth"]) } } From 432e06e801f0f90cb9dab609e5b92d434fc58390 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:19:56 +0200 Subject: [PATCH 08/45] Add GoString method for jobmeta --- pkg/schema/job.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 5e3110b..df901b4 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -145,7 +145,12 @@ const ( JobStateOutOfMemory JobState = "out_of_memory" ) -func (e *JobState) UnmarshalGQL(v interface{}) error { +func (j JobMeta) GoString() string { + return fmt.Sprintf("JobMeta{ID:%d, StartTime:%d, JobID:%v, BaseJob:%v}", + j.ID, j.StartTime, j.JobID, j.BaseJob) +} + +func (e *JobState) UnmarshalGQL(v any) error { str, ok := v.(string) if !ok { return fmt.Errorf("SCHEMA/JOB > enums must be strings") From eab7961a83ef1604fc496d75d6cae95249dc815a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:32:19 +0200 Subject: [PATCH 09/45] Introduce caching table for faster job inserts Fixes #392 --- internal/repository/job.go | 22 ++------- internal/repository/jobCreate.go | 47 ++++++++++++++++++- internal/repository/jobFind.go | 20 ++++++++ internal/repository/migration.go | 13 ++++- .../sqlite3/09_add-job-cache.down.sql | 1 + .../sqlite3/09_add-job-cache.up.sql | 31 ++++++++++++ 6 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 internal/repository/migrations/sqlite3/09_add-job-cache.down.sql create mode 100644 internal/repository/migrations/sqlite3/09_add-job-cache.up.sql diff --git a/internal/repository/job.go b/internal/repository/job.go index 84de6f7..54a436a 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -9,12 +9,12 @@ import ( "encoding/json" "errors" "fmt" + "maps" "math" "strconv" "sync" "time" - "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/lrucache" @@ -33,6 +33,7 @@ type JobRepository struct { stmtCache *sq.StmtCache cache *lrucache.Cache driver string + Mutex sync.Mutex } func GetJobRepository() *JobRepository { @@ -56,7 +57,7 @@ var jobColumns []string = []string{ "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", } -func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { +func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} if err := row.Scan( @@ -138,17 +139,6 @@ func (r *JobRepository) Flush() error { return nil } -func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) { - jobLink := &model.JobLink{} - if err := row.Scan( - &jobLink.ID, &jobLink.JobID); err != nil { - log.Warn("Error while scanning rows (jobLink)") - return nil, err - } - - return jobLink, nil -} - func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { start := time.Now() cachekey := fmt.Sprintf("metadata:%d", job.ID) @@ -189,9 +179,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er if job.MetaData != nil { cpy := make(map[string]string, len(job.MetaData)+1) - for k, v := range job.MetaData { - cpy[k] = v - } + maps.Copy(cpy, job.MetaData) cpy[key] = val job.MetaData = cpy } else { @@ -389,7 +377,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table func (r *JobRepository) Partitions(cluster string) ([]string, error) { var err 
error start := time.Now() - partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) { + partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) { parts := []string{} if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { return nil, 0, 1000 diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 9e47974..3b997f3 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -13,6 +13,14 @@ import ( sq "github.com/Masterminds/squirrel" ) +const NamedJobCacheInsert string = `INSERT INTO job_cache ( + job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data +) VALUES ( + :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data +);` + const NamedJobInsert string = `INSERT INTO job ( job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data @@ -22,7 +30,9 @@ const NamedJobInsert string = `INSERT INTO job ( );` func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { - res, err := r.DB.NamedExec(NamedJobInsert, job) + r.Mutex.Lock() + res, err := r.DB.NamedExec(NamedJobCacheInsert, job) + r.Mutex.Unlock() if err != nil { log.Warn("Error while NamedJobInsert") return 0, err @@ -36,6 +46,25 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { return id, nil } +func (r *JobRepository) SyncJobs() error { + r.Mutex.Lock() + defer r.Mutex.Unlock() + _, err := r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + if err != nil { + log.Warnf("Error while Job sync: %v", err) + return err + } + + _, err = r.DB.Exec("DELETE FROM job_cache") + if err != nil { + log.Warn("Error while Job cache clean") + return err + } + + return nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { @@ -73,3 +102,19 @@ func (r *JobRepository) Stop( _, err = stmt.RunWith(r.stmtCache).Exec() return } + +func (r *JobRepository) StopCached( + jobId int64, + duration int32, + state schema.JobState, + monitoringStatus int32, +) (err error) { + stmt := sq.Update("job_cache"). + Set("job_state", state). + Set("duration", duration). + Set("monitoring_status", monitoringStatus). 
+ Where("job.id = ?", jobId) + + _, err = stmt.RunWith(r.stmtCache).Exec() + return +} diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 1e2ccb8..ac09355 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -43,6 +43,26 @@ func (r *JobRepository) Find( return scanJob(q.RunWith(r.stmtCache).QueryRow()) } +func (r *JobRepository) FindCached( + jobId *int64, + cluster *string, + startTime *int64, +) (*schema.Job, error) { + q := sq.Select(jobColumns...).From("job_cache"). + Where("job_cache.job_id = ?", *jobId) + + if cluster != nil { + q = q.Where("job_cache.cluster = ?", *cluster) + } + if startTime != nil { + q = q.Where("job_cache.start_time = ?", *startTime) + } + + q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match + + return scanJob(q.RunWith(r.stmtCache).QueryRow()) +} + // Find executes a SQL query to find a specific batch job. // The job is queried using the batch job id, the cluster name, // and the start time of the job in UNIX epoch time seconds. diff --git a/internal/repository/migration.go b/internal/repository/migration.go index 0b2591e..c0693da 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,7 +16,7 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) -const Version uint = 8 +const Version uint = 9 //go:embed migrations/* var migrationFiles embed.FS @@ -115,8 +115,17 @@ func MigrateDB(backend string, db string) error { } v, dirty, err := m.Version() + if err != nil { + if err == migrate.ErrNilVersion { + log.Warn("Legacy database without version or missing database file!") + } else { + return err + } + } - log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + if v < Version { + log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + } if dirty { return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version) diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql new file mode 100644 index 0000000..ef257cf --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS job_cache; diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql new file mode 100644 index 0000000..7840369 --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -0,0 +1,31 @@ +CREATE TABLE "job_cache" ( + id INTEGER PRIMARY KEY, + job_id BIGINT NOT NULL, + cluster VARCHAR(255) NOT NULL, + subcluster VARCHAR(255) NOT NULL, + start_time BIGINT NOT NULL, -- Unix timestamp + hpc_user VARCHAR(255) NOT NULL, + project VARCHAR(255) NOT NULL, + cluster_partition VARCHAR(255), + array_job_id BIGINT, + duration INT NOT NULL, + walltime INT NOT NULL, + job_state VARCHAR(255) NOT NULL + CHECK (job_state IN ( + 'running', 'completed', 'failed', 'cancelled', + 'stopped', 'timeout', 'preempted', 'out_of_memory' + )), + meta_data TEXT, -- JSON + resources TEXT NOT NULL, -- JSON + num_nodes INT NOT NULL, + num_hwthreads INT, + num_acc INT, + smt TINYINT NOT NULL DEFAULT 1 CHECK (smt IN (0, 1)), + exclusive TINYINT NOT NULL DEFAULT 1 CHECK (exclusive IN (0, 1, 2)), + 
monitoring_status TINYINT NOT NULL DEFAULT 1 + CHECK (monitoring_status IN (0, 1, 2, 3)), + energy REAL NOT NULL DEFAULT 0.0, + energy_footprint TEXT DEFAULT NULL, + footprint TEXT DEFAULT NULL, + UNIQUE (job_id, cluster, start_time) +); From 40110580e080fbd1188ba7ef3e2154f2f97ef768 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:33:44 +0200 Subject: [PATCH 10/45] feat: add job hook support Fixes #394 --- internal/repository/jobHooks.go | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 internal/repository/jobHooks.go diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go new file mode 100644 index 0000000..d69874f --- /dev/null +++ b/internal/repository/jobHooks.go @@ -0,0 +1,34 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +type JobHook interface { + jobStartCallback() + jobStopCallback() +} + +var hooks []JobHook + +func RegisterJobJook(hook JobHook) { + if hook != nil { + hooks = append(hooks, hook) + } +} + +func CallJobStartHooks() { + for _, hook := range hooks { + if hook != nil { + hook.jobStartCallback() + } + } +} + +func CallJobStopHooks() { + for _, hook := range hooks { + if hook != nil { + hook.jobStopCallback() + } + } +} From d76b1ae75dc1a98395dfb05b722ce17a1f623800 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:36:33 +0200 Subject: [PATCH 11/45] feat: add job commit service Sync jobs from job cache table to main job table. Enables #392 --- internal/taskManager/commitJobService.go | 35 ++++++++++++++++++++++++ internal/taskManager/taskManager.go | 1 + pkg/schema/config.go | 4 ++- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 internal/taskManager/commitJobService.go diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go new file mode 100644 index 0000000..7749348 --- /dev/null +++ b/internal/taskManager/commitJobService.go @@ -0,0 +1,35 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
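An aside on the JobHook interface introduced in PATCH 10 above: because jobStartCallback and jobStopCallback are unexported at this stage, only types defined inside package repository can satisfy the interface (PATCH 16 below exports the callbacks and adds the job argument). A minimal same-package sketch, with a hypothetical auditHook type that is not part of the patches:

package repository

// auditHook is a hypothetical example hook, shown only to illustrate the
// callback contract; real registrations happen elsewhere in the code base.
type auditHook struct{}

func (h *auditHook) jobStartCallback() { /* e.g. refresh derived data */ }

func (h *auditHook) jobStopCallback() { /* e.g. flush per-job statistics */ }

// registerAuditHook wires the example hook into the callback chain.
func registerAuditHook() {
	RegisterJobJook(&auditHook{})
}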
+package taskManager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/go-co-op/gocron/v2" +) + +func RegisterCommitJobService() { + var frequency string + if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.CommitJobWorker != "" { + frequency = config.Keys.CronFrequency.CommitJobWorker + } else { + frequency = "2m" + } + d, _ := time.ParseDuration(frequency) + log.Infof("Register commitJob service with %s interval", frequency) + + s.NewJob(gocron.DurationJob(d), + gocron.NewTask( + func() { + start := time.Now() + log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) + jobRepo.SyncJobs() + repository.CallJobStartHooks() + log.Printf("Jobcache sync is done and took %s", time.Since(start)) + })) +} diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index 2004e0d..7d9a3a2 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -81,6 +81,7 @@ func Start() { RegisterFootprintWorker() RegisterUpdateDurationWorker() + RegisterCommitJobService() s.Start() } diff --git a/pkg/schema/config.go b/pkg/schema/config.go index 27d11be..a5caa61 100644 --- a/pkg/schema/config.go +++ b/pkg/schema/config.go @@ -89,6 +89,8 @@ type ResampleConfig struct { } type CronFrequency struct { + // Duration Update Worker [Defaults to '2m'] + CommitJobWorker string `json:"commit-job-worker"` // Duration Update Worker [Defaults to '5m'] DurationWorker string `json:"duration-worker"` // Metric-Footprint Update Worker [Defaults to '10m'] @@ -150,7 +152,7 @@ type ProgramConfig struct { // If overwritten, at least all the options in the defaults below must // be provided! Most options here can be overwritten by the user. - UiDefaults map[string]interface{} `json:"ui-defaults"` + UiDefaults map[string]any `json:"ui-defaults"` // If exists, will enable dynamic zoom in frontend metric plots using the configured values EnableResampling *ResampleConfig `json:"enable-resampling"` From 2e781b900d4b5b9b50a212f53dd38759cc598bf7 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:37:24 +0200 Subject: [PATCH 12/45] Staged error handling for job cache --- internal/api/rest.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/internal/api/rest.go b/internal/api/rest.go index 669768e..e0804cb 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -820,7 +820,7 @@ func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) { } rw.WriteHeader(http.StatusOK) - rw.Write([]byte(fmt.Sprintf("Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount))) + fmt.Fprintf(rw, "Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount) } // startJob godoc @@ -846,6 +846,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { return } + log.Printf("REST: %s\n", req.GoString()) req.State = schema.JobStateRunning if err := importer.SanityChecks(&req.BaseJob); err != nil { @@ -931,8 +932,12 @@ func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // log.Printf("loading db job for stopJobByRequest... 
: stopJobApiRequest=%v", req) job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return + job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime) + // FIXME: Previous error is hidden + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) + return + } } api.checkAndHandleStopJob(rw, job, req) @@ -1097,10 +1102,15 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo // Mark job as stopped in the database (update state and duration) job.Duration = int32(req.StopTime - job.StartTime.Unix()) job.State = req.State + api.JobRepository.Mutex.Lock() if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) - return + if err := api.JobRepository.StopCached(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + api.JobRepository.Mutex.Unlock() + handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + return + } } + api.JobRepository.Mutex.Unlock() log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) @@ -1116,6 +1126,8 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } + repository.CallJobStopHooks() + // Trigger async archiving archiver.TriggerArchiving(job) } From f06b5f8fc0c6196f4caa31cab856b3ad992803f3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:37:36 +0200 Subject: [PATCH 13/45] Refactor --- internal/auth/auth.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 5f88bbb..3e57768 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -237,7 +237,7 @@ func (auth *Authentication) Login( limiter := getIPUserLimiter(ip, username) if !limiter.Allow() { log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username) - onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes.")) + onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes")) return } From f30b784f45baf0b943be0176818d7e5728e70db3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 16 May 2025 17:38:00 +0200 Subject: [PATCH 14/45] Attempt to fix api test Tests still fail --- internal/api/api_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index e67813c..2e864a3 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -253,6 +253,7 @@ func TestRestApi(t *testing.T) { t.Fatal(response.Status, recorder.Body.String()) } resolver := graph.GetResolverInstance() + restapi.JobRepository.SyncJobs() job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) @@ -312,7 +313,7 @@ func TestRestApi(t *testing.T) { } archiver.WaitForArchiving() - job, err := restapi.JobRepository.Find(&TestJobId, 
&TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.FindCached(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -425,7 +426,7 @@ func TestRestApi(t *testing.T) { archiver.WaitForArchiving() jobid, cluster := int64(12345), "testcluster" - job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) + job, err := restapi.JobRepository.FindCached(&jobid, &cluster, nil) if err != nil { t.Fatal(err) } From 99f8187092d6aab9d1c9c20399b9a2a1cfb7202e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 09:17:16 +0200 Subject: [PATCH 15/45] Port tests to new architecture --- internal/api/api_test.go | 30 +++++++++++++++--------------- internal/importer/importer_test.go | 2 +- internal/repository/job.go | 11 ++++++++--- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 2e864a3..3af37ad 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -123,7 +123,7 @@ func setup(t *testing.T) *api.RestApi { t.Fatal(err) } - if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil { + if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 2), 0666); err != nil { t.Fatal(err) } @@ -204,11 +204,11 @@ func TestRestApi(t *testing.T) { restapi.MountApiRoutes(r) var TestJobId int64 = 123 - var TestClusterName string = "testcluster" + TestClusterName := "testcluster" var TestStartTime int64 = 123456789 const startJobBody string = `{ - "jobId": 123, + "jobId": 123, "user": "testuser", "project": "testproj", "cluster": "testcluster", @@ -221,7 +221,6 @@ func TestRestApi(t *testing.T) { "exclusive": 1, "monitoringStatus": 1, "smt": 1, - "tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }], "resources": [ { "hostname": "host123", @@ -252,17 +251,17 @@ func TestRestApi(t *testing.T) { if response.StatusCode != http.StatusCreated { t.Fatal(response.Status, recorder.Body.String()) } - resolver := graph.GetResolverInstance() + // resolver := graph.GetResolverInstance() restapi.JobRepository.SyncJobs() job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } - job.Tags, err = resolver.Job().Tags(ctx, job) - if err != nil { - t.Fatal(err) - } + // job.Tags, err = resolver.Job().Tags(ctx, job) + // if err != nil { + // t.Fatal(err) + // } if job.JobID != 123 || job.User != "testuser" || @@ -283,9 +282,9 @@ func TestRestApi(t *testing.T) { t.Fatalf("unexpected job properties: %#v", job) } - if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { - t.Fatalf("unexpected tags: %#v", job.Tags) - } + // if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { + // t.Fatalf("unexpected tags: %#v", job.Tags) + // } }); !ok { return } @@ -313,7 +312,7 @@ func TestRestApi(t *testing.T) { } archiver.WaitForArchiving() - job, err := restapi.JobRepository.FindCached(&TestJobId, &TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -353,7 +352,7 @@ func TestRestApi(t *testing.T) { t.Run("CheckDoubleStart", func(t *testing.T) { // Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart! 
- body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1) + body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`) req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body))) recorder := httptest.NewRecorder() @@ -403,6 +402,7 @@ func TestRestApi(t *testing.T) { } time.Sleep(1 * time.Second) + restapi.JobRepository.SyncJobs() const stopJobBodyFailed string = `{ "jobId": 12345, @@ -426,7 +426,7 @@ func TestRestApi(t *testing.T) { archiver.WaitForArchiving() jobid, cluster := int64(12345), "testcluster" - job, err := restapi.JobRepository.FindCached(&jobid, &cluster, nil) + job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) if err != nil { t.Fatal(err) } diff --git a/internal/importer/importer_test.go b/internal/importer/importer_test.go index 209b6be..d2bb0b4 100644 --- a/internal/importer/importer_test.go +++ b/internal/importer/importer_test.go @@ -166,7 +166,7 @@ func TestHandleImportFlag(t *testing.T) { } result := readResult(t, testname) - job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime) + job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime) if err != nil { t.Fatal(err) } diff --git a/internal/repository/job.go b/internal/repository/job.go index 54a436a..29aa63e 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -51,10 +51,15 @@ func GetJobRepository() *JobRepository { return jobRepoInstance } +// var jobColumns []string = []string{ +// "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", +// "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", +// "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", +// } + var jobColumns []string = []string{ - "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", - "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", - "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", + "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state", + "duration", "walltime", "resources", "footprint", "energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { From 14bad81b9fd46ceca683aaffbe64566fb7b37972 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 13:25:39 +0200 Subject: [PATCH 16/45] Extend Job Hooks and add unit tests Add job tagger control --- internal/api/rest.go | 2 +- internal/repository/jobCreate.go | 32 +++++++++++++++---- internal/repository/jobHooks.go | 37 +++++++++++++++++----- internal/tagger/apps/python.txt | 3 ++ internal/tagger/detectApp.go | 28 +++++++++++------ internal/tagger/detectApp_test.go | 2 +- internal/tagger/tagger.go | 40 ++++++++++++++++++++++-- internal/tagger/tagger_test.go | 31 ++++++++++++++++++ internal/taskManager/commitJobService.go | 6 ++-- 9 files changed, 150 insertions(+), 31 deletions(-) create mode 100644 internal/tagger/apps/python.txt create mode 100644 internal/tagger/tagger_test.go diff --git a/internal/api/rest.go b/internal/api/rest.go index 
e0804cb..6133a5e 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -1126,7 +1126,7 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - repository.CallJobStopHooks() + repository.CallJobStopHooks(job) // Trigger async archiving archiver.TriggerArchiving(job) diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 3b997f3..a651db9 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -46,23 +46,43 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { return id, nil } -func (r *JobRepository) SyncJobs() error { +func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { r.Mutex.Lock() defer r.Mutex.Unlock() - _, err := r.DB.Exec( + + query := sq.Select(jobColumns...).From("job_cache") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Errorf("Error while running query %v", err) + return nil, err + } + + jobs := make([]*schema.Job, 0, 50) + for rows.Next() { + job, err := scanJob(rows) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jobs = append(jobs, job) + } + + _, err = r.DB.Exec( "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { log.Warnf("Error while Job sync: %v", err) - return err + return nil, err } _, err = r.DB.Exec("DELETE FROM job_cache") if err != nil { - log.Warn("Error while Job cache clean") - return err + log.Warnf("Error while Job cache clean: %v", err) + return nil, err } - return nil + return jobs, nil } // Start inserts a new job in the table, returning the unique job ID. diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index d69874f..1016335 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -4,31 +4,54 @@ // license that can be found in the LICENSE file. 
package repository +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + type JobHook interface { - jobStartCallback() - jobStopCallback() + JobStartCallback(job *schema.Job) + JobStopCallback(job *schema.Job) } -var hooks []JobHook +var ( + initOnce sync.Once + hooks []JobHook +) func RegisterJobJook(hook JobHook) { + initOnce.Do(func() { + hooks = make([]JobHook, 0) + }) + if hook != nil { hooks = append(hooks, hook) } } -func CallJobStartHooks() { +func CallJobStartHooks(jobs []*schema.Job) { + if hooks == nil { + return + } + for _, hook := range hooks { if hook != nil { - hook.jobStartCallback() + for _, job := range jobs { + hook.JobStartCallback(job) + } } } } -func CallJobStopHooks() { +func CallJobStopHooks(job *schema.Job) { + if hooks == nil { + return + } + for _, hook := range hooks { if hook != nil { - hook.jobStopCallback() + hook.JobStopCallback(job) } } } diff --git a/internal/tagger/apps/python.txt b/internal/tagger/apps/python.txt new file mode 100644 index 0000000..7a5c661 --- /dev/null +++ b/internal/tagger/apps/python.txt @@ -0,0 +1,3 @@ +python +anaconda +conda diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 339e398..44a08e0 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -8,6 +8,7 @@ import ( "bufio" "embed" "fmt" + "io/fs" "path/filepath" "strings" @@ -27,16 +28,10 @@ type appInfo struct { } type AppTagger struct { - apps []appInfo + apps map[string]appInfo } -func (t *AppTagger) Register() error { - files, err := appFiles.ReadDir("apps") - if err != nil { - return fmt.Errorf("error reading app folder: %#v", err) - } - t.apps = make([]appInfo, 0) - +func (t *AppTagger) scanApps(files []fs.DirEntry) error { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) @@ -50,12 +45,25 @@ func (t *AppTagger) Register() error { for scanner.Scan() { ai.strings = append(ai.strings, scanner.Text()) } - t.apps = append(t.apps, ai) + delete(t.apps, ai.tag) + t.apps[ai.tag] = ai } - return nil } +// func (t *AppTagger) Reload() error { +// +// } + +func (t *AppTagger) Register() error { + files, err := appFiles.ReadDir("apps") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.apps = make(map[string]appInfo, 0) + return t.scanApps(files) +} + func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() meta, err := r.FetchMetadata(job) diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 8978e35..3b43cce 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -35,7 +35,7 @@ func TestRegister(t *testing.T) { err := tagger.Register() noErr(t, err) - if len(tagger.apps) != 3 { + if len(tagger.apps) != 4 { t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 3", len(tagger.apps)) } } diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 52a369b..4fbbc9e 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -4,14 +4,48 @@ // license that can be found in the LICENSE file. 
package tagger -import "github.com/ClusterCockpit/cc-backend/pkg/schema" +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) type Tagger interface { Register() error Match(job *schema.Job) } -func Init() error { +var ( + initOnce sync.Once + jobTagger *JobTagger +) - return nil +type JobTagger struct { + startTaggers []Tagger + stopTaggers []Tagger +} + +func Init() { + initOnce.Do(func() { + jobTagger = &JobTagger{} + jobTagger.startTaggers = make([]Tagger, 0) + jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + + for _, tagger := range jobTagger.startTaggers { + tagger.Register() + } + + // jobTagger.stopTaggers = make([]Tagger, 0) + repository.RegisterJobJook(jobTagger) + }) +} + +func (jt *JobTagger) JobStartCallback(job *schema.Job) { + for _, tagger := range jobTagger.startTaggers { + tagger.Match(job) + } +} + +func (jt *JobTagger) JobStopCallback(job *schema.Job) { } diff --git a/internal/tagger/tagger_test.go b/internal/tagger/tagger_test.go new file mode 100644 index 0000000..057ca17 --- /dev/null +++ b/internal/tagger/tagger_test.go @@ -0,0 +1,31 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +func TestInit(t *testing.T) { + Init() +} + +func TestJobStartCallback(t *testing.T) { + Init() + r := setup(t) + job, err := r.FindByIdDirect(2) + noErr(t, err) + + jobs := make([]*schema.Job, 0, 1) + jobs = append(jobs, job) + + repository.CallJobStartHooks(jobs) + if !r.HasTag(2, "app", "python") { + t.Errorf("missing tag python") + } +} diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go index 7749348..c60acb3 100644 --- a/internal/taskManager/commitJobService.go +++ b/internal/taskManager/commitJobService.go @@ -28,8 +28,8 @@ func RegisterCommitJobService() { func() { start := time.Now() log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) - jobRepo.SyncJobs() - repository.CallJobStartHooks() - log.Printf("Jobcache sync is done and took %s", time.Since(start)) + jobs, _ := jobRepo.SyncJobs() + repository.CallJobStartHooks(jobs) + log.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start)) })) } From 85f17c0fd85fff07e14009f85566927955477f25 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 May 2025 16:08:43 +0200 Subject: [PATCH 17/45] Refactor Tagger package. 
Add fsNotify Service --- cmd/cc-backend/main.go | 5 +++ go.mod | 1 + go.sum | 2 + internal/repository/job.go | 6 ++- internal/tagger/detectApp.go | 64 ++++++++++++++++++++++--------- internal/tagger/tagger.go | 1 - internal/util/fswatcher.go | 73 ++++++++++++++++++++++++++++++++++++ 7 files changed, 131 insertions(+), 21 deletions(-) create mode 100644 internal/util/fswatcher.go diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 4b6d7f9..cbfccef 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -19,7 +19,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/tagger" "github.com/ClusterCockpit/cc-backend/internal/taskManager" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv" @@ -216,6 +218,7 @@ func main() { } archiver.Start(repository.GetJobRepository()) + tagger.Init() taskManager.Start() serverInit() @@ -237,6 +240,8 @@ func main() { serverShutdown() + util.FsWatcherShutdown() + taskManager.Shutdown() }() diff --git a/go.mod b/go.mod index 98d1cab..f17ec18 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,7 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect github.com/go-jose/go-jose/v4 v4.0.5 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect diff --git a/go.sum b/go.sum index a76e112..57b1649 100644 --- a/go.sum +++ b/go.sum @@ -55,6 +55,8 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs= diff --git a/internal/repository/job.go b/internal/repository/job.go index 29aa63e..73a2588 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -58,8 +58,10 @@ func GetJobRepository() *JobRepository { // } var jobColumns []string = []string{ - "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state", - "duration", "walltime", "resources", "footprint", "energy", + "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", + "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", + "exclusive", "monitoring_status", "smt", "job_state", "duration", "walltime", + "resources", "footprint", "energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { diff 
--git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 44a08e0..621e20c 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -9,15 +9,20 @@ import ( "embed" "fmt" "io/fs" + "os" "path/filepath" "strings" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -const tagType = "app" +const ( + tagType = "app" + appPath = "./var/tagger/apps" +) //go:embed apps/* var appFiles embed.FS @@ -31,37 +36,60 @@ type AppTagger struct { apps map[string]appInfo } -func (t *AppTagger) scanApps(files []fs.DirEntry) error { +func (t *AppTagger) scanApp(f fs.File, fns string) { + scanner := bufio.NewScanner(f) + ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + + for scanner.Scan() { + ai.strings = append(ai.strings, scanner.Text()) + } + delete(t.apps, ai.tag) + t.apps[ai.tag] = ai +} + +func (t *AppTagger) EventMatch(s string) bool { + return strings.Contains(s, "apps") +} + +func (t *AppTagger) EventCallback() { + files, err := os.ReadDir(appPath) + if err != nil { + log.Fatal(err) + } + for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + f, err := os.Open(fmt.Sprintf("%s/%s", appPath, fns)) if err != nil { - return fmt.Errorf("error opening app file %s: %#v", fns, err) + log.Errorf("error opening app file %s: %#v", fns, err) } - scanner := bufio.NewScanner(f) - ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} - - for scanner.Scan() { - ai.strings = append(ai.strings, scanner.Text()) - } - delete(t.apps, ai.tag) - t.apps[ai.tag] = ai + t.scanApp(f, fns) } - return nil } -// func (t *AppTagger) Reload() error { -// -// } - func (t *AppTagger) Register() error { files, err := appFiles.ReadDir("apps") if err != nil { return fmt.Errorf("error reading app folder: %#v", err) } t.apps = make(map[string]appInfo, 0) - return t.scanApps(files) + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + t.scanApp(f, fns) + } + + if util.CheckFileExists(appPath) { + log.Infof("Setup file watch for %s", appPath) + util.AddListener(appPath, t) + } + + return nil } func (t *AppTagger) Match(job *schema.Job) { diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 4fbbc9e..b336125 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -36,7 +36,6 @@ func Init() { tagger.Register() } - // jobTagger.stopTaggers = make([]Tagger, 0) repository.RegisterJobJook(jobTagger) }) } diff --git a/internal/util/fswatcher.go b/internal/util/fswatcher.go new file mode 100644 index 0000000..aaf3372 --- /dev/null +++ b/internal/util/fswatcher.go @@ -0,0 +1,73 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package util + +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/fsnotify/fsnotify" +) + +type Listener interface { + EventCallback() + EventMatch(event string) bool +} + +var ( + initOnce sync.Once + w *fsnotify.Watcher + listeners []Listener +) + +func AddListener(path string, l Listener) { + var err error + + initOnce.Do(func() { + var err error + w, err = fsnotify.NewWatcher() + if err != nil { + log.Error("creating a new watcher: %w", err) + } + listeners = make([]Listener, 0) + + go watchLoop(w) + }) + + listeners = append(listeners, l) + err = w.Add(path) + if err != nil { + log.Warnf("%q: %s", path, err) + } +} + +func FsWatcherShutdown() { + w.Close() +} + +func watchLoop(w *fsnotify.Watcher) { + for { + select { + // Read from Errors. + case err, ok := <-w.Errors: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + log.Errorf("watch event loop: %s", err) + // Read from Events. + case e, ok := <-w.Events: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + + log.Infof("Event %s", e) + for _, l := range listeners { + if l.EventMatch(e.String()) { + l.EventCallback() + } + } + } + } +} From 9abc206d1af849d86e737c5219f6ab248e830e50 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 20 May 2025 07:10:15 +0200 Subject: [PATCH 18/45] Read in tagger config on startup. Safeguard watcher shutdown --- internal/tagger/detectApp.go | 2 ++ internal/util/fswatcher.go | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 621e20c..d3d797d 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -51,6 +51,7 @@ func (t *AppTagger) EventMatch(s string) bool { return strings.Contains(s, "apps") } +// FIXME: Only process the file that caused the event func (t *AppTagger) EventCallback() { files, err := os.ReadDir(appPath) if err != nil { @@ -85,6 +86,7 @@ func (t *AppTagger) Register() error { } if util.CheckFileExists(appPath) { + t.EventCallback() log.Infof("Setup file watch for %s", appPath) util.AddListener(appPath, t) } diff --git a/internal/util/fswatcher.go b/internal/util/fswatcher.go index aaf3372..5d13462 100644 --- a/internal/util/fswatcher.go +++ b/internal/util/fswatcher.go @@ -44,7 +44,9 @@ func AddListener(path string, l Listener) { } func FsWatcherShutdown() { - w.Close() + if w != nil { + w.Close() + } } func watchLoop(w *fsnotify.Watcher) { From ca634bb70741697f7f34cd21df998d1100b2ff76 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 22 May 2025 07:10:41 +0200 Subject: [PATCH 19/45] Refactor taggers. Refine Job Hooks. 
Start job classifier --- go.mod | 1 + go.sum | 2 + internal/api/rest.go | 2 - internal/archiver/archiveWorker.go | 7 ++ internal/repository/jobHooks.go | 4 +- internal/tagger/classifyJob.go | 121 +++++++++++++++++++++ internal/tagger/detectApp.go | 35 +++--- internal/tagger/detectApp_test.go | 8 +- internal/tagger/jobclasses/highload.json | 38 +++++++ internal/tagger/jobclasses/highmem.json | 40 +++++++ internal/tagger/jobclasses/lowgpuload.json | 36 ++++++ internal/tagger/jobclasses/lowload.json | 38 +++++++ internal/tagger/tagger.go | 6 +- pkg/archive/fsBackend.go | 39 +------ 14 files changed, 316 insertions(+), 61 deletions(-) create mode 100644 internal/tagger/classifyJob.go create mode 100644 internal/tagger/jobclasses/highload.json create mode 100644 internal/tagger/jobclasses/highmem.json create mode 100644 internal/tagger/jobclasses/lowgpuload.json create mode 100644 internal/tagger/jobclasses/lowload.json diff --git a/go.mod b/go.mod index f17ec18..062ee3e 100644 --- a/go.mod +++ b/go.mod @@ -43,6 +43,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect + github.com/expr-lang/expr v1.17.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect diff --git a/go.sum b/go.sum index 57b1649..b4c3781 100644 --- a/go.sum +++ b/go.sum @@ -53,6 +53,8 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/expr-lang/expr v1.17.3 h1:myeTTuDFz7k6eFe/JPlep/UsiIjVhG61FMHFu63U7j0= +github.com/expr-lang/expr v1.17.3/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= diff --git a/internal/api/rest.go b/internal/api/rest.go index 6133a5e..fe35942 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -1126,8 +1126,6 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - repository.CallJobStopHooks(job) - // Trigger async archiving archiver.TriggerArchiving(job) } diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 628e36e..6e514cb 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -72,7 +72,14 @@ func archivingWorker() { } log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) log.Printf("archiving job (dbid: %d) successful", job.ID) + + id := job.ID + jobMeta.ID = &id + + repository.CallJobStopHooks(jobMeta) archivePending.Done() + default: + continue } } } diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 1016335..49535f7 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -12,7 +12,7 @@ import ( type JobHook interface { JobStartCallback(job *schema.Job) - JobStopCallback(job *schema.Job) + JobStopCallback(job *schema.JobMeta) } var ( @@ -44,7 +44,7 @@ func CallJobStartHooks(jobs []*schema.Job) { } } -func 
CallJobStopHooks(job *schema.Job) { +func CallJobStopHooks(job *schema.JobMeta) { if hooks == nil { return } diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go new file mode 100644 index 0000000..ec1e843 --- /dev/null +++ b/internal/tagger/classifyJob.go @@ -0,0 +1,121 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "bytes" + "embed" + "fmt" + "io/fs" + "os" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/expr-lang/expr" + "github.com/expr-lang/expr/vm" +) + +//go:embed jobclasses/* +var jobclassFiles embed.FS + +type ruleInfo struct { + tag string + rule *vm.Program +} + +type JobClassTagger struct { + rules map[string]ruleInfo + tagType string + cfgPath string +} + +func (t *JobClassTagger) compileRule(f fs.File, fns string) { + buf := new(bytes.Buffer) + _, err := buf.ReadFrom(f) + if err != nil { + log.Errorf("error reading rule file %s: %#v", fns, err) + } + prg, err := expr.Compile(buf.String(), expr.AsBool()) + if err != nil { + log.Errorf("error compiling rule %s: %#v", fns, err) + } + ri := ruleInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), rule: prg} + + delete(t.rules, ri.tag) + t.rules[ri.tag] = ri +} + +func (t *JobClassTagger) EventMatch(s string) bool { + return strings.Contains(s, "jobclasses") +} + +// FIXME: Only process the file that caused the event +func (t *JobClassTagger) EventCallback() { + files, err := os.ReadDir(t.cfgPath) + if err != nil { + log.Fatal(err) + } + + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) + if err != nil { + log.Errorf("error opening app file %s: %#v", fns, err) + } + t.compileRule(f, fns) + } +} + +func (t *JobClassTagger) Register() error { + t.cfgPath = "./var/tagger/jobclasses" + t.tagType = "jobClass" + + files, err := appFiles.ReadDir("jobclasses") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.rules = make(map[string]ruleInfo, 0) + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + defer f.Close() + t.compileRule(f, fns) + } + + if util.CheckFileExists(t.cfgPath) { + t.EventCallback() + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) + } + + return nil +} + +func (t *JobClassTagger) Match(job *schema.JobMeta) { + r := repository.GetJobRepository() + + for _, ri := range t.rules { + tag := ri.tag + output, err := expr.Run(ri.rule, job) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + } + if output.(bool) { + id := job.ID + if !r.HasTag(*id, t.tagType, tag) { + r.AddTagOrCreateDirect(*id, t.tagType, tag) + } + } + } +} diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index d3d797d..8057aad 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -19,11 +19,6 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -const ( - tagType = "app" - appPath = "./var/tagger/apps" -) - //go:embed 
apps/* var appFiles embed.FS @@ -33,7 +28,9 @@ type appInfo struct { } type AppTagger struct { - apps map[string]appInfo + apps map[string]appInfo + tagType string + cfgPath string } func (t *AppTagger) scanApp(f fs.File, fns string) { @@ -53,7 +50,7 @@ func (t *AppTagger) EventMatch(s string) bool { // FIXME: Only process the file that caused the event func (t *AppTagger) EventCallback() { - files, err := os.ReadDir(appPath) + files, err := os.ReadDir(t.cfgPath) if err != nil { log.Fatal(err) } @@ -61,7 +58,7 @@ func (t *AppTagger) EventCallback() { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := os.Open(fmt.Sprintf("%s/%s", appPath, fns)) + f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) if err != nil { log.Errorf("error opening app file %s: %#v", fns, err) } @@ -70,6 +67,9 @@ func (t *AppTagger) EventCallback() { } func (t *AppTagger) Register() error { + t.cfgPath = "./var/tagger/apps" + t.tagType = "app" + files, err := appFiles.ReadDir("apps") if err != nil { return fmt.Errorf("error reading app folder: %#v", err) @@ -79,28 +79,25 @@ func (t *AppTagger) Register() error { fns := fn.Name() log.Debugf("Process: %s", fns) f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + defer f.Close() if err != nil { return fmt.Errorf("error opening app file %s: %#v", fns, err) } t.scanApp(f, fns) } - if util.CheckFileExists(appPath) { + if util.CheckFileExists(t.cfgPath) { t.EventCallback() - log.Infof("Setup file watch for %s", appPath) - util.AddListener(appPath, t) + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) } return nil } -func (t *AppTagger) Match(job *schema.Job) { +func (t *AppTagger) Match(job *schema.JobMeta) { r := repository.GetJobRepository() - meta, err := r.FetchMetadata(job) - if err != nil { - log.Error("cannot fetch meta data") - } - jobscript, ok := meta["jobScript"] + jobscript, ok := job.MetaData["jobScript"] if ok { id := job.ID @@ -109,8 +106,8 @@ func (t *AppTagger) Match(job *schema.Job) { tag := a.tag for _, s := range a.strings { if strings.Contains(jobscript, s) { - if !r.HasTag(id, tagType, tag) { - r.AddTagOrCreateDirect(id, tagType, tag) + if !r.HasTag(*id, t.tagType, tag) { + r.AddTagOrCreateDirect(*id, t.tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 3b43cce..56bd856 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -9,6 +9,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" ) func setup(tb testing.TB) *repository.JobRepository { @@ -51,7 +52,12 @@ func TestMatch(t *testing.T) { err = tagger.Register() noErr(t, err) - tagger.Match(job) + jobMeta := &schema.JobMeta{ + ID: &job.ID, + BaseJob: job.BaseJob, + StartTime: job.StartTime.Unix(), + } + tagger.Match(jobMeta) if !r.HasTag(5, "app", "vasp") { t.Errorf("missing tag vasp") diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json new file mode 100644 index 0000000..a65f400 --- /dev/null +++ b/internal/tagger/jobclasses/highload.json @@ -0,0 +1,38 @@ +{ + "name": "Excessive CPU load", + "tag": "excessiveload", + "comment": "Assumptions: all nodes have the same number of cores.", + "parameters": [ + "excessivecpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "cpu_load" + ], + "requirements": [ + 
"job.exclusive == 1", + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "terms": [ + { + "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')" + }, + { + "load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" + }, + { + "highload_nodes": "load_mean > load_threshold" + }, + { + "highload": "highload_nodes.any('all')" + }, + { + "load_perc": "load_mean / load_threshold" + } + ], + "output": "highload", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/jobclasses/highmem.json b/internal/tagger/jobclasses/highmem.json new file mode 100644 index 0000000..69ffcf3 --- /dev/null +++ b/internal/tagger/jobclasses/highmem.json @@ -0,0 +1,40 @@ +{ + "name": "High memory usage", + "tag": "high_memory_load", + "parameters": [ + "high_memory_load_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "mem_used" + ], + "requirements": [ + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds", + "hasattr(job, \"allocated_memory\")" + ], + "terms": [ + { + "memory_alloc": "job.allocated_memory" + }, + { + "memory_used": "mem_used.max('time')" + }, + { + "load_threshold": "memory_alloc * high_memory_load_threshold_factor" + }, + { + "high_mem_nodes": "memory_used > load_threshold" + }, + { + "high_mem": "high_mem_nodes.any('all')" + }, + { + "load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)" + } + ], + "output": "high_mem", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/jobclasses/lowgpuload.json b/internal/tagger/jobclasses/lowgpuload.json new file mode 100644 index 0000000..80339b2 --- /dev/null +++ b/internal/tagger/jobclasses/lowgpuload.json @@ -0,0 +1,36 @@ +{ + "name": "Low GPU load", + "tag": "lowgpuload", + "parameters": [ + "lowgpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "nv_util" + ], + "requirements": [ + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "terms": [ + { + "load_mean": "nv_util.mean('all')" + }, + { + "load_threshold": "job.numAcc * lowgpuload_threshold_factor" + }, + { + "lowload_nodes": "load_mean < load_threshold" + }, + { + "lowload": "lowload_nodes.any('all')" + }, + { + "load_perc": "1.0 - (load_mean / load_threshold)" + } + ], + "output": "lowload", + "output_scalar": "load_perc", + "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." 
+} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json new file mode 100644 index 0000000..e860361 --- /dev/null +++ b/internal/tagger/jobclasses/lowload.json @@ -0,0 +1,38 @@ +{ + "name": "Low CPU load", + "tag": "lowload", + "parameters": [ + "lowcpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": [ + "cpu_load" + ], + "requirements": [ + "job.exclusive == 1", + "job.duration > job_min_duration_seconds", + "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + ], + "tagRule": [ + { + "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')" + }, + { + "load_threshold": "job.numHwthreads * lowcpuload_threshold_factor" + }, + { + "lowload_nodes": "load_mean < load_threshold" + }, + { + "lowload": "lowload_nodes.any('all')" + }, + { + "load_perc": "1.0 - (load_mean / load_threshold)" + } + ], + "valueRule": [], + "output": "lowload", + "output_scalar": "load_perc", + "hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index b336125..d5e42b1 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -13,7 +13,7 @@ import ( type Tagger interface { Register() error - Match(job *schema.Job) + Match(job *schema.JobMeta) } var ( @@ -31,6 +31,8 @@ func Init() { jobTagger = &JobTagger{} jobTagger.startTaggers = make([]Tagger, 0) jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + jobTagger.stopTaggers = make([]Tagger, 0) + jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) for _, tagger := range jobTagger.startTaggers { tagger.Register() @@ -46,5 +48,5 @@ func (jt *JobTagger) JobStartCallback(job *schema.Job) { } } -func (jt *JobTagger) JobStopCallback(job *schema.Job) { +func (jt *JobTagger) JobStopCallback(job *schema.JobMeta) { } diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 711b1f5..a59b663 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -59,14 +59,13 @@ func getDirectory( func getPath( job *schema.Job, rootPath string, - file string) string { - + file string, +) string { return filepath.Join( getDirectory(job, rootPath), file) } func loadJobMeta(filename string) (*schema.JobMeta, error) { - b, err := os.ReadFile(filename) if err != nil { log.Errorf("loadJobMeta() > open file error: %v", err) @@ -83,7 +82,6 @@ func loadJobMeta(filename string) (*schema.JobMeta, error) { func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobData()- %v", err) return nil, err @@ -117,7 +115,6 @@ func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobStats()- %v", err) return nil, err @@ -150,7 +147,6 @@ func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, er } func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) { - var config FsArchiveConfig if err := json.Unmarshal(rawConfig, &config); err != nil { log.Warnf("Init() > Unmarshal error: %#v", err) @@ -276,7 +272,6 @@ func (fsa *FsArchive) Exists(job *schema.Job) bool { } func (fsa *FsArchive) Clean(before int64, after int64) { - if 
after == 0 { after = math.MaxInt64 } @@ -392,7 +387,6 @@ func (fsa *FsArchive) Compress(jobs []*schema.Job) { } func (fsa *FsArchive) CompressLast(starttime int64) int64 { - filename := filepath.Join(fsa.path, "compress.txt") b, err := os.ReadFile(filename) if err != nil { @@ -441,7 +435,6 @@ func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { } func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { - b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json")) if err != nil { log.Errorf("LoadClusterCfg() > open file error: %v", err) @@ -456,7 +449,6 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { } func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { - ch := make(chan JobContainer) go func() { clustersDir, err := os.ReadDir(fsa.path) @@ -527,7 +519,6 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { } func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error { - job := schema.Job{ BaseJob: jobMeta.BaseJob, StartTime: time.Unix(jobMeta.StartTime, 0), @@ -556,8 +547,8 @@ func (fsa *FsArchive) GetClusters() []string { func (fsa *FsArchive) ImportJob( jobMeta *schema.JobMeta, - jobData *schema.JobData) error { - + jobData *schema.JobData, +) error { job := schema.Job{ BaseJob: jobMeta.BaseJob, StartTime: time.Unix(jobMeta.StartTime, 0), @@ -583,28 +574,6 @@ func (fsa *FsArchive) ImportJob( return err } - // var isCompressed bool = true - // // TODO Use shortJob Config for check - // if jobMeta.Duration < 300 { - // isCompressed = false - // f, err = os.Create(path.Join(dir, "data.json")) - // } else { - // f, err = os.Create(path.Join(dir, "data.json.gz")) - // } - // if err != nil { - // return err - // } - // - // if isCompressed { - // if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil { - // return err - // } - // } else { - // if err := EncodeJobData(f, jobData); err != nil { - // return err - // } - // } - f, err = os.Create(path.Join(dir, "data.json")) if err != nil { log.Error("Error while creating filepath for data.json") From 733e3ea9d584592d8a78a7244bb2a0a678ffcb06 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 23 May 2025 07:48:27 +0200 Subject: [PATCH 20/45] Revert interface from jobMeta to job type. Extend job classifier tagger. Cleanup test rules. 
--- internal/archiver/archiveWorker.go | 5 +- internal/repository/jobHooks.go | 4 +- internal/tagger/classifyJob.go | 193 +++++++++++++++++---- internal/tagger/detectApp.go | 6 +- internal/tagger/detectApp_test.go | 8 +- internal/tagger/jobclasses/highload.json | 21 +-- internal/tagger/jobclasses/highmem.json | 40 ----- internal/tagger/jobclasses/lowgpuload.json | 36 ---- internal/tagger/jobclasses/lowload.json | 26 +-- internal/tagger/jobclasses/parameters.json | 14 ++ internal/tagger/tagger.go | 7 +- 11 files changed, 202 insertions(+), 158 deletions(-) delete mode 100644 internal/tagger/jobclasses/highmem.json delete mode 100644 internal/tagger/jobclasses/lowgpuload.json create mode 100644 internal/tagger/jobclasses/parameters.json diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 6e514cb..42a60b9 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -73,10 +73,7 @@ func archivingWorker() { log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) log.Printf("archiving job (dbid: %d) successful", job.ID) - id := job.ID - jobMeta.ID = &id - - repository.CallJobStopHooks(jobMeta) + repository.CallJobStopHooks(job) archivePending.Done() default: continue diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 49535f7..1016335 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -12,7 +12,7 @@ import ( type JobHook interface { JobStartCallback(job *schema.Job) - JobStopCallback(job *schema.JobMeta) + JobStopCallback(job *schema.Job) } var ( @@ -44,7 +44,7 @@ func CallJobStartHooks(jobs []*schema.Job) { } } -func CallJobStopHooks(job *schema.JobMeta) { +func CallJobStopHooks(job *schema.Job) { if hooks == nil { return } diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index ec1e843..f7195e3 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -7,14 +7,16 @@ package tagger import ( "bytes" "embed" + "encoding/json" "fmt" - "io/fs" + "maps" "os" - "path/filepath" "strings" + "text/template" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/expr-lang/expr" @@ -24,31 +26,100 @@ import ( //go:embed jobclasses/* var jobclassFiles embed.FS +type Variable struct { + Name string `json:"name"` + Expr string `json:"expr"` +} + +type ruleVariable struct { + name string + expr *vm.Program +} + +type RuleFormat struct { + Name string `json:"name"` + Tag string `json:"tag"` + Parameters []string `json:"parameters"` + Metrics []string `json:"metrics"` + Requirements []string `json:"requirements"` + Variables []Variable `json:"variables"` + Rule string `json:"rule"` + Hint string `json:"hint"` +} + type ruleInfo struct { - tag string - rule *vm.Program + env map[string]any + metrics []string + requirements []*vm.Program + variables []ruleVariable + rule *vm.Program + hint *template.Template } type JobClassTagger struct { - rules map[string]ruleInfo - tagType string - cfgPath string + rules map[string]ruleInfo + parameters map[string]any + tagType string + cfgPath string } -func (t *JobClassTagger) compileRule(f fs.File, fns string) { - buf := new(bytes.Buffer) - _, err := buf.ReadFrom(f) +func (t *JobClassTagger) prepareRule(filename string, fns string) { + b, err := 
os.ReadFile(filename) if err != nil { - log.Errorf("error reading rule file %s: %#v", fns, err) + log.Warnf("prepareRule() > open file error: %v", err) + return } - prg, err := expr.Compile(buf.String(), expr.AsBool()) + + var rule RuleFormat + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil { + log.Warn("Error while decoding raw job meta json") + return + } + + ri := ruleInfo{} + ri.env = make(map[string]any) + ri.metrics = make([]string, 0) + ri.requirements = make([]*vm.Program, 0) + ri.variables = make([]ruleVariable, 0) + + // check if all required parameters are available + for _, p := range rule.Parameters { + param, ok := t.parameters[p] + if !ok { + log.Warnf("prepareRule() > missing parameter %s in rule %s", p, fns) + return + } + ri.env[p] = param + } + + // set all required metrics + for _, m := range rule.Metrics { + ri.metrics = append(ri.metrics, m) + } + + // compile requirements + for _, r := range rule.Requirements { + req, err := expr.Compile(r, expr.AsBool()) + if err != nil { + log.Errorf("error compiling requirement %s: %#v", r, err) + return + } + ri.requirements = append(ri.requirements, req) + } + + // compile rule + exp, err := expr.Compile(rule.Rule, expr.AsBool()) if err != nil { log.Errorf("error compiling rule %s: %#v", fns, err) + return } - ri := ruleInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), rule: prg} + ri.rule = exp - delete(t.rules, ri.tag) - t.rules[ri.tag] = ri + // prepare hint template + ri.hint = template.Must(template.New(fns).Parse(rule.Hint)) + + delete(t.rules, rule.Tag) + t.rules[rule.Tag] = ri } func (t *JobClassTagger) EventMatch(s string) bool { @@ -65,11 +136,8 @@ func (t *JobClassTagger) EventCallback() { for _, fn := range files { fns := fn.Name() log.Debugf("Process: %s", fns) - f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) - if err != nil { - log.Errorf("error opening app file %s: %#v", fns, err) - } - t.compileRule(f, fns) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + t.prepareRule(filename, fns) } } @@ -84,13 +152,23 @@ func (t *JobClassTagger) Register() error { t.rules = make(map[string]ruleInfo, 0) for _, fn := range files { fns := fn.Name() - log.Debugf("Process: %s", fns) - f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) - if err != nil { - return fmt.Errorf("error opening app file %s: %#v", fns, err) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + + if fn.Name() == "parameters.json" { + b, err := os.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return err + } + + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil { + log.Warn("Error while decoding parameters.json") + return err + } + continue } - defer f.Close() - t.compileRule(f, fns) + log.Debugf("Process: %s", fns) + t.prepareRule(filename, fns) } if util.CheckFileExists(t.cfgPath) { @@ -102,20 +180,69 @@ func (t *JobClassTagger) Register() error { return nil } -func (t *JobClassTagger) Match(job *schema.JobMeta) { +func (t *JobClassTagger) Match(job *schema.Job) { r := repository.GetJobRepository() + jobstats, err := archive.GetStatistics(job) + if err != nil { + log.Errorf("job classification failed for job %d: %#v", job.JobID, err) + return + } - for _, ri := range t.rules { - tag := ri.tag - output, err := expr.Run(ri.rule, job) + for tag, ri := range t.rules { + env := make(map[string]any) + maps.Copy(env, ri.env) + + // add metrics to env + for _, m := range ri.metrics { + stats, ok := jobstats[m] + if !ok { + log.Errorf("job 
classification failed for job %d: missing metric '%s'", job.JobID, m) + return + } + env[m] = stats.Avg + } + + // check rule requirements apply + for _, r := range ri.requirements { + ok, err := expr.Run(r, env) + if err != nil { + log.Errorf("error running requirement for rule %s: %#v", tag, err) + return + } + if !ok.(bool) { + log.Infof("requirement for rule %s not met", tag) + return + } + } + + // validate rule expression + for _, v := range ri.variables { + value, err := expr.Run(v.expr, env) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + return + } + env[v.name] = value + } + + match, err := expr.Run(ri.rule, job) if err != nil { log.Errorf("error running rule %s: %#v", tag, err) } - if output.(bool) { + if match.(bool) { id := job.ID - if !r.HasTag(*id, t.tagType, tag) { - r.AddTagOrCreateDirect(*id, t.tagType, tag) + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) } } + + // process hint template + var msg bytes.Buffer + if err := ri.hint.Execute(&msg, env); err != nil { + log.Errorf("Template error: %s", err.Error()) + } + + // FIXME: Handle case where multiple tags apply + r.UpdateMetadata(job, "message", msg.String()) } } diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 8057aad..a37924e 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -95,7 +95,7 @@ func (t *AppTagger) Register() error { return nil } -func (t *AppTagger) Match(job *schema.JobMeta) { +func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() jobscript, ok := job.MetaData["jobScript"] if ok { @@ -106,8 +106,8 @@ func (t *AppTagger) Match(job *schema.JobMeta) { tag := a.tag for _, s := range a.strings { if strings.Contains(jobscript, s) { - if !r.HasTag(*id, t.tagType, tag) { - r.AddTagOrCreateDirect(*id, t.tagType, tag) + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) break out } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 56bd856..3b43cce 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -9,7 +9,6 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" ) func setup(tb testing.TB) *repository.JobRepository { @@ -52,12 +51,7 @@ func TestMatch(t *testing.T) { err = tagger.Register() noErr(t, err) - jobMeta := &schema.JobMeta{ - ID: &job.ID, - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - } - tagger.Match(jobMeta) + tagger.Match(job) if !r.HasTag(5, "app", "vasp") { t.Errorf("missing tag vasp") diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index a65f400..29d4026 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -12,27 +12,22 @@ ], "requirements": [ "job.exclusive == 1", - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + "job.duration > job_min_duration_seconds" ], "terms": [ { + "name": "", "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')" }, { - "load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" + "name": "load_threshold", + "expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" }, { - "highload_nodes": "load_mean > load_threshold" - }, - { - "highload": 
"highload_nodes.any('all')" - }, - { - "load_perc": "load_mean / load_threshold" + "name": "load_perc", + "expr": "load_mean / load_threshold" } ], - "output": "highload", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}." + "rule": "cpu_load > load_threshold", + "hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}." } diff --git a/internal/tagger/jobclasses/highmem.json b/internal/tagger/jobclasses/highmem.json deleted file mode 100644 index 69ffcf3..0000000 --- a/internal/tagger/jobclasses/highmem.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "name": "High memory usage", - "tag": "high_memory_load", - "parameters": [ - "high_memory_load_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" - ], - "metrics": [ - "mem_used" - ], - "requirements": [ - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds", - "hasattr(job, \"allocated_memory\")" - ], - "terms": [ - { - "memory_alloc": "job.allocated_memory" - }, - { - "memory_used": "mem_used.max('time')" - }, - { - "load_threshold": "memory_alloc * high_memory_load_threshold_factor" - }, - { - "high_mem_nodes": "memory_used > load_threshold" - }, - { - "high_mem": "high_mem_nodes.any('all')" - }, - { - "load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)" - } - ], - "output": "high_mem", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}." -} diff --git a/internal/tagger/jobclasses/lowgpuload.json b/internal/tagger/jobclasses/lowgpuload.json deleted file mode 100644 index 80339b2..0000000 --- a/internal/tagger/jobclasses/lowgpuload.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "name": "Low GPU load", - "tag": "lowgpuload", - "parameters": [ - "lowgpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" - ], - "metrics": [ - "nv_util" - ], - "requirements": [ - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" - ], - "terms": [ - { - "load_mean": "nv_util.mean('all')" - }, - { - "load_threshold": "job.numAcc * lowgpuload_threshold_factor" - }, - { - "lowload_nodes": "load_mean < load_threshold" - }, - { - "lowload": "lowload_nodes.any('all')" - }, - { - "load_perc": "1.0 - (load_mean / load_threshold)" - } - ], - "output": "lowload", - "output_scalar": "load_perc", - "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." 
-} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index e860361..3c5bd4d 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -11,28 +11,18 @@ ], "requirements": [ "job.exclusive == 1", - "job.duration > job_min_duration_seconds", - "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds" + "job.duration > job_min_duration_seconds" ], - "tagRule": [ + "variables": [ { - "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')" + "name": "load_threshold", + "expr": "job.numHwthreads * lowcpuload_threshold_factor" }, { - "load_threshold": "job.numHwthreads * lowcpuload_threshold_factor" - }, - { - "lowload_nodes": "load_mean < load_threshold" - }, - { - "lowload": "lowload_nodes.any('all')" - }, - { - "load_perc": "1.0 - (load_mean / load_threshold)" + "name": "load_perc", + "expr": "1.0 - (cpu_load / load_threshold)" } ], - "valueRule": [], - "output": "lowload", - "output_scalar": "load_perc", - "hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}." + "rule": "cpu_load < load_threshold", + "hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}." } diff --git a/internal/tagger/jobclasses/parameters.json b/internal/tagger/jobclasses/parameters.json new file mode 100644 index 0000000..39e94c1 --- /dev/null +++ b/internal/tagger/jobclasses/parameters.json @@ -0,0 +1,14 @@ +{ + "lowcpuload_threshold_factor": 0.9, + "excessivecpuload_threshold_factor": 1.1, + "highmemoryusage_threshold_factor": 0.9, + "node_load_imbalance_threshold_factor": 0.1, + "core_load_imbalance_threshold_factor": 0.1, + "high_memory_load_threshold_factor": 0.9, + "lowgpuload_threshold_factor": 0.7, + "memory_leak_slope_threshold": 0.1, + "job_min_duration_seconds": 600.0, + "sampling_interval_seconds": 30.0, + "cpu_load_pre_cutoff_samples": 11.0, + "cpu_load_core_pre_cutoff_samples": 6.0 +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index d5e42b1..ffdd011 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -13,7 +13,7 @@ import ( type Tagger interface { Register() error - Match(job *schema.JobMeta) + Match(job *schema.Job) } var ( @@ -48,5 +48,8 @@ func (jt *JobTagger) JobStartCallback(job *schema.Job) { } } -func (jt *JobTagger) JobStopCallback(job *schema.JobMeta) { +func (jt *JobTagger) JobStopCallback(job *schema.Job) { + for _, tagger := range jobTagger.stopTaggers { + tagger.Match(job) + } } From 3c66840f953cdba46a23f7c32e71e6186e830489 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 23 May 2025 10:13:59 +0200 Subject: [PATCH 21/45] Add tagger config option and command line switch to run taggers on all jobs --- cmd/cc-backend/cli.go | 6 ++-- cmd/cc-backend/main.go | 11 ++++++- internal/repository/job.go | 1 + internal/repository/jobFind.go | 29 +++++++++++++++++++ internal/tagger/tagger.go | 53 ++++++++++++++++++++++++++-------- pkg/schema/config.go | 2 ++ 6 files changed, 87 insertions(+), 15 deletions(-) diff --git a/cmd/cc-backend/cli.go b/cmd/cc-backend/cli.go index 8d9e7e6..8b826bb 100644 --- a/cmd/cc-backend/cli.go +++ b/cmd/cc-backend/cli.go @@ -7,8 +7,9 @@ package main import "flag" var ( - flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool - flagNewUser, 
flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string + flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, + flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool + flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string ) func cliInit() { @@ -21,6 +22,7 @@ func cliInit() { flag.BoolVar(&flagVersion, "version", false, "Show version information and exit") flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit") flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit") + flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit") flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index cbfccef..cd2d08d 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -213,12 +213,21 @@ func main() { } } + if flagApplyTags { + if err := tagger.RunTaggers(); err != nil { + log.Abortf("Running job taggers.\nError: %s\n", err.Error()) + } + } + if !flagServer { log.Exit("No errors, server flag not set. Exiting cc-backend.") } archiver.Start(repository.GetJobRepository()) - tagger.Init() + + if config.Keys.EnableJobTaggers { + tagger.Init() + } taskManager.Start() serverInit() diff --git a/internal/repository/job.go b/internal/repository/job.go index 73a2588..97ca280 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -472,6 +472,7 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } +// FIXME: Reconsider filtering short jobs with harcoded threshold func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). Where(fmt.Sprintf("job.cluster = '%s'", cluster)). diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index ac09355..614b7c0 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -103,6 +103,35 @@ func (r *JobRepository) FindAll( return jobs, nil } +// Get complete joblist only consisting of db ids. +// This is useful to process large job counts and intended to be used +// together with FindById to process jobs one by one +func (r *JobRepository) GetJobList() ([]int64, error) { + query := sq.Select("id").From("job"). + Where("job.job_state != 'running'") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Error("Error while running query") + return nil, err + } + + jl := make([]int64, 0, 1000) + for rows.Next() { + var id int64 + err := rows.Scan(&id) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jl = append(jl, id) + } + + log.Infof("Return job count %d", len(jl)) + return jl, nil +} + // FindById executes a SQL query to find a specific batch job. // The job is queried using the database id. // It returns a pointer to a schema.Job data structure and an error variable. 
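How the pieces added in this patch fit together: GetJobList returns the database ids of all non-running jobs, FindByIdDirect loads them one at a time, and RunTaggers (see the tagger.go hunk below) feeds each job through the registered start and stop taggers. A minimal usage sketch — the -apply-tags and -config switches and the "enable-job-taggers" key are taken from the hunks in this patch; the binary name and the rest of config.json are assumptions:

    # config.json fragment (assumption: all other required keys left unchanged)
    { "enable-job-taggers": true }

    # one-shot backfill: run every tagger over all completed jobs, then exit
    $ ./cc-backend -config ./config.json -apply-tags

With only the config key set, tagging happens incrementally through the job start/stop hooks; the -apply-tags switch is the batch path for retro-tagging an existing job database.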
diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index ffdd011..da32fc4 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" ) @@ -26,30 +27,58 @@ type JobTagger struct { stopTaggers []Tagger } +func newTagger() { + jobTagger = &JobTagger{} + jobTagger.startTaggers = make([]Tagger, 0) + jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + jobTagger.stopTaggers = make([]Tagger, 0) + jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) + + for _, tagger := range jobTagger.startTaggers { + tagger.Register() + } +} + func Init() { initOnce.Do(func() { - jobTagger = &JobTagger{} - jobTagger.startTaggers = make([]Tagger, 0) - jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) - jobTagger.stopTaggers = make([]Tagger, 0) - jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) - - for _, tagger := range jobTagger.startTaggers { - tagger.Register() - } - + newTagger() repository.RegisterJobJook(jobTagger) }) } func (jt *JobTagger) JobStartCallback(job *schema.Job) { - for _, tagger := range jobTagger.startTaggers { + for _, tagger := range jt.startTaggers { tagger.Match(job) } } func (jt *JobTagger) JobStopCallback(job *schema.Job) { - for _, tagger := range jobTagger.stopTaggers { + for _, tagger := range jt.stopTaggers { tagger.Match(job) } } + +func RunTaggers() error { + newTagger() + r := repository.GetJobRepository() + jl, err := r.GetJobList() + if err != nil { + log.Errorf("Error while getting job list %s", err) + return err + } + + for _, id := range jl { + job, err := r.FindByIdDirect(id) + if err != nil { + log.Errorf("Error while getting job %s", err) + return err + } + for _, tagger := range jobTagger.startTaggers { + tagger.Match(job) + } + for _, tagger := range jobTagger.stopTaggers { + tagger.Match(job) + } + } + return nil +} diff --git a/pkg/schema/config.go b/pkg/schema/config.go index a5caa61..eda3d91 100644 --- a/pkg/schema/config.go +++ b/pkg/schema/config.go @@ -131,6 +131,8 @@ type ProgramConfig struct { // do not write to the job-archive. 
DisableArchive bool `json:"disable-archive"` + EnableJobTaggers bool `json:"enable-job-taggers"` + // Validate json input against schema Validate bool `json:"validate"` From f14bdb306845ba8304e8d79418a885ee19983b71 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 13:08:03 +0200 Subject: [PATCH 22/45] Fix bugs in job classifier and tagger infrastructure --- cmd/cc-backend/main.go | 7 +- go.mod | 3 + go.sum | 6 ++ internal/tagger/classifyJob.go | 96 ++++++++++++++++++------ internal/tagger/detectApp.go | 10 ++- internal/tagger/jobclasses/highload.json | 16 ++-- internal/tagger/jobclasses/lowload.json | 8 +- internal/tagger/tagger.go | 6 +- 8 files changed, 105 insertions(+), 47 deletions(-) diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index cd2d08d..ab07d28 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -213,6 +213,10 @@ func main() { } } + if config.Keys.EnableJobTaggers { + tagger.Init() + } + if flagApplyTags { if err := tagger.RunTaggers(); err != nil { log.Abortf("Running job taggers.\nError: %s\n", err.Error()) @@ -225,9 +229,6 @@ func main() { archiver.Start(repository.GetJobRepository()) - if config.Keys.EnableJobTaggers { - tagger.Init() - } taskManager.Start() serverInit() diff --git a/go.mod b/go.mod index 062ee3e..c57d9ed 100644 --- a/go.mod +++ b/go.mod @@ -54,6 +54,8 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/go-viper/mapstructure/v2 v2.2.1 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gookit/color v1.5.4 // indirect + github.com/gookit/goutil v0.6.18 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect @@ -80,6 +82,7 @@ require ( github.com/sosodev/duration v1.3.1 // indirect github.com/swaggo/files v1.0.1 // indirect github.com/urfave/cli/v2 v2.27.5 // indirect + github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.uber.org/atomic v1.11.0 // indirect golang.org/x/mod v0.23.0 // indirect diff --git a/go.sum b/go.sum index b4c3781..2102888 100644 --- a/go.sum +++ b/go.sum @@ -101,6 +101,10 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0= +github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w= +github.com/gookit/goutil v0.6.18 h1:MUVj0G16flubWT8zYVicIuisUiHdgirPAkmnfD2kKgw= +github.com/gookit/goutil v0.6.18/go.mod h1:AY/5sAwKe7Xck+mEbuxj0n/bc3qwrGNe3Oeulln7zBA= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -241,6 +245,8 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I= github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM= +github.com/xo/terminfo 
v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= +github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index f7195e3..bf86894 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -21,6 +21,7 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/expr-lang/expr" "github.com/expr-lang/expr/vm" + "github.com/gookit/goutil/dump" ) //go:embed jobclasses/* @@ -63,13 +64,7 @@ type JobClassTagger struct { cfgPath string } -func (t *JobClassTagger) prepareRule(filename string, fns string) { - b, err := os.ReadFile(filename) - if err != nil { - log.Warnf("prepareRule() > open file error: %v", err) - return - } - +func (t *JobClassTagger) prepareRule(b []byte, fns string) { var rule RuleFormat if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil { log.Warn("Error while decoding raw job meta json") @@ -93,9 +88,7 @@ func (t *JobClassTagger) prepareRule(filename string, fns string) { } // set all required metrics - for _, m := range rule.Metrics { - ri.metrics = append(ri.metrics, m) - } + ri.metrics = append(ri.metrics, rule.Metrics...) // compile requirements for _, r := range rule.Requirements { @@ -107,6 +100,16 @@ func (t *JobClassTagger) prepareRule(filename string, fns string) { ri.requirements = append(ri.requirements, req) } + // compile variables + for _, v := range rule.Variables { + req, err := expr.Compile(v.Expr, expr.AsFloat64()) + if err != nil { + log.Errorf("error compiling requirement %s: %#v", v.Name, err) + return + } + ri.variables = append(ri.variables, ruleVariable{name: v.Name, expr: req}) + } + // compile rule exp, err := expr.Compile(rule.Rule, expr.AsBool()) if err != nil { @@ -116,7 +119,11 @@ func (t *JobClassTagger) prepareRule(filename string, fns string) { ri.rule = exp // prepare hint template - ri.hint = template.Must(template.New(fns).Parse(rule.Hint)) + ri.hint, err = template.New(fns).Parse(rule.Hint) + if err != nil { + log.Errorf("error processing template %s: %#v", fns, err) + } + log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables)) delete(t.rules, rule.Tag) t.rules[rule.Tag] = ri @@ -137,38 +144,59 @@ func (t *JobClassTagger) EventCallback() { fns := fn.Name() log.Debugf("Process: %s", fns) filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) - t.prepareRule(filename, fns) + b, err := os.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return + } + t.prepareRule(b, fns) } } +func (t *JobClassTagger) initParameters() error { + log.Info("Initialize parameters") + b, err := jobclassFiles.ReadFile("jobclasses/parameters.json") + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return err + } + + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil { + log.Warn("Error while decoding parameters.json") + return err + } + + return nil +} + func (t *JobClassTagger) Register() error { t.cfgPath = "./var/tagger/jobclasses" t.tagType = "jobClass" - files, err := 
appFiles.ReadDir("jobclasses") + err := t.initParameters() + if err != nil { + log.Warnf("error reading parameters.json: %v", err) + return err + } + + files, err := jobclassFiles.ReadDir("jobclasses") if err != nil { return fmt.Errorf("error reading app folder: %#v", err) } t.rules = make(map[string]ruleInfo, 0) for _, fn := range files { fns := fn.Name() - filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + if fns != "parameters.json" { + filename := fmt.Sprintf("jobclasses/%s", fns) + log.Infof("Process: %s", fns) - if fn.Name() == "parameters.json" { - b, err := os.ReadFile(filename) + b, err := jobclassFiles.ReadFile(filename) if err != nil { log.Warnf("prepareRule() > open file error: %v", err) return err } - - if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil { - log.Warn("Error while decoding parameters.json") - return err - } - continue + t.prepareRule(b, fns) } - log.Debugf("Process: %s", fns) - t.prepareRule(filename, fns) } if util.CheckFileExists(t.cfgPath) { @@ -183,6 +211,7 @@ func (t *JobClassTagger) Register() error { func (t *JobClassTagger) Match(job *schema.Job) { r := repository.GetJobRepository() jobstats, err := archive.GetStatistics(job) + log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) if err != nil { log.Errorf("job classification failed for job %d: %#v", job.JobID, err) return @@ -191,6 +220,16 @@ func (t *JobClassTagger) Match(job *schema.Job) { for tag, ri := range t.rules { env := make(map[string]any) maps.Copy(env, ri.env) + log.Infof("Try to match rule %s for job %d", tag, job.JobID) + env["job"] = map[string]any{ + "exclusive": job.Exclusive, + "duration": job.Duration, + "numCores": job.NumHWThreads, + "numNodes": job.NumNodes, + "jobState": job.State, + "numAcc": job.NumAcc, + "smt": job.SMT, + } // add metrics to env for _, m := range ri.metrics { @@ -225,21 +264,28 @@ func (t *JobClassTagger) Match(job *schema.Job) { env[v.name] = value } - match, err := expr.Run(ri.rule, job) + dump.P(env) + + match, err := expr.Run(ri.rule, env) if err != nil { log.Errorf("error running rule %s: %#v", tag, err) + return } if match.(bool) { + log.Info("Rule matches!") id := job.ID if !r.HasTag(id, t.tagType, tag) { r.AddTagOrCreateDirect(id, t.tagType, tag) } + } else { + log.Info("Rule does not match!") } // process hint template var msg bytes.Buffer if err := ri.hint.Execute(&msg, env); err != nil { log.Errorf("Template error: %s", err.Error()) + return } // FIXME: Handle case where multiple tags apply diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index a37924e..7945b48 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -79,10 +79,10 @@ func (t *AppTagger) Register() error { fns := fn.Name() log.Debugf("Process: %s", fns) f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) - defer f.Close() if err != nil { return fmt.Errorf("error opening app file %s: %#v", fns, err) } + defer f.Close() t.scanApp(f, fns) } @@ -97,7 +97,13 @@ func (t *AppTagger) Register() error { func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() - jobscript, ok := job.MetaData["jobScript"] + metadata, err := r.FetchMetadata(job) + if err != nil { + log.Infof("Cannot fetch metadata for job: %d on %s", job.JobID, job.Cluster) + return + } + + jobscript, ok := metadata["jobScript"] if ok { id := job.ID diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 29d4026..2715ee8 100644 --- 
a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -7,27 +7,21 @@ "job_min_duration_seconds", "sampling_interval_seconds" ], - "metrics": [ - "cpu_load" - ], + "metrics": ["cpu_load"], "requirements": [ "job.exclusive == 1", "job.duration > job_min_duration_seconds" ], - "terms": [ - { - "name": "", - "load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')" - }, + "variables": [ { "name": "load_threshold", - "expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor" + "expr": "(job.numCores / job.numNodes) * excessivecpuload_threshold_factor" }, { "name": "load_perc", - "expr": "load_mean / load_threshold" + "expr": "cpu_load / load_threshold" } ], "rule": "cpu_load > load_threshold", - "hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}." + "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load}} falls above the threshold {{.load_threshold}}." } diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 3c5bd4d..4c21a6b 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -6,9 +6,7 @@ "job_min_duration_seconds", "sampling_interval_seconds" ], - "metrics": [ - "cpu_load" - ], + "metrics": ["cpu_load"], "requirements": [ "job.exclusive == 1", "job.duration > job_min_duration_seconds" @@ -16,7 +14,7 @@ "variables": [ { "name": "load_threshold", - "expr": "job.numHwthreads * lowcpuload_threshold_factor" + "expr": "job.numCores * lowcpuload_threshold_factor" }, { "name": "load_perc", @@ -24,5 +22,5 @@ } ], "rule": "cpu_load < load_threshold", - "hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}." + "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}." 
} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index da32fc4..04edd49 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -32,11 +32,14 @@ func newTagger() { jobTagger.startTaggers = make([]Tagger, 0) jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) jobTagger.stopTaggers = make([]Tagger, 0) - jobTagger.stopTaggers = append(jobTagger.startTaggers, &JobClassTagger{}) + jobTagger.stopTaggers = append(jobTagger.stopTaggers, &JobClassTagger{}) for _, tagger := range jobTagger.startTaggers { tagger.Register() } + for _, tagger := range jobTagger.stopTaggers { + tagger.Register() + } } func Init() { @@ -77,6 +80,7 @@ func RunTaggers() error { tagger.Match(job) } for _, tagger := range jobTagger.stopTaggers { + log.Infof("Run stop tagger for job %d", job.ID) tagger.Match(job) } } From 8d6ae85b0d76b2e25198a4e474b608345d33e34a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 13:26:18 +0200 Subject: [PATCH 23/45] Fix bug with job columns --- internal/repository/job.go | 29 +++++++++++++++++------------ internal/repository/jobCreate.go | 2 +- internal/repository/jobFind.go | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/internal/repository/job.go b/internal/repository/job.go index 97ca280..c6c566e 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -51,25 +51,30 @@ func GetJobRepository() *JobRepository { return jobRepoInstance } -// var jobColumns []string = []string{ -// "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", -// "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", -// "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", -// } - var jobColumns []string = []string{ - "id", "job_id", "hpc_user", "project", "cluster", "subcluster", "start_time", - "cluster_partition", "array_job_id", "num_nodes", "num_hwthreads", "num_acc", - "exclusive", "monitoring_status", "smt", "job_state", "duration", "walltime", - "resources", "footprint", "energy", + "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", + "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", + "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", + "job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources", + "job.footprint", "job.energy", +} + +var jobCacheColumns []string = []string{ + "jobcache.id", "jobcache.job_id", "jobcache.hpc_user", "jobcache.project", "jobcache.cluster", + "jobcache.subcluster", "jobcache.start_time", "jobcache.cluster_partition", + "jobcache.array_job_id", "jobcache.num_nodes", "jobcache.num_hwthreads", + "jobcache.num_acc", "jobcache.exclusive", "jobcache.monitoring_status", "jobcache.smt", + "jobcache.job_state", "jobcache.duration", "jobcache.walltime", "jobcache.resources", + "jobcache.footprint", "jobcache.energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} if err := row.Scan( - &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, - &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, + &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, + &job.StartTimeUnix, 
&job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, + &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, &job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil { log.Warnf("Error while scanning rows (Job): %v", err) return nil, err diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index a651db9..f286c68 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -50,7 +50,7 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { r.Mutex.Lock() defer r.Mutex.Unlock() - query := sq.Select(jobColumns...).From("job_cache") + query := sq.Select(jobCacheColumns...).From("job_cache") rows, err := query.RunWith(r.stmtCache).Query() if err != nil { diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 614b7c0..b820084 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -48,7 +48,7 @@ func (r *JobRepository) FindCached( cluster *string, startTime *int64, ) (*schema.Job, error) { - q := sq.Select(jobColumns...).From("job_cache"). + q := sq.Select(jobCacheColumns...).From("job_cache"). Where("job_cache.job_id = ?", *jobId) if cluster != nil { From 0261c263f96e50196ce87db0f01c848d111caf87 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 13:36:23 +0200 Subject: [PATCH 24/45] Add hint message only if rule matches --- internal/tagger/classifyJob.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index bf86894..9c4f7cb 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -277,18 +277,18 @@ func (t *JobClassTagger) Match(job *schema.Job) { if !r.HasTag(id, t.tagType, tag) { r.AddTagOrCreateDirect(id, t.tagType, tag) } + + // process hint template + var msg bytes.Buffer + if err := ri.hint.Execute(&msg, env); err != nil { + log.Errorf("Template error: %s", err.Error()) + return + } + + // FIXME: Handle case where multiple tags apply + r.UpdateMetadata(job, "message", msg.String()) } else { log.Info("Rule does not match!") } - - // process hint template - var msg bytes.Buffer - if err := ri.hint.Execute(&msg, env); err != nil { - log.Errorf("Template error: %s", err.Error()) - return - } - - // FIXME: Handle case where multiple tags apply - r.UpdateMetadata(job, "message", msg.String()) } } From 1e7fbe5d561263b735e13b9ee43d35388e9abad1 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 13:40:34 +0200 Subject: [PATCH 25/45] Refactor --- internal/repository/jobQuery.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index 6a2ddec..2f72e77 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -148,9 +148,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select } if filter.DbID != nil { dbIDs := make([]string, len(filter.DbID)) - for i, val := range filter.DbID { - dbIDs[i] = val - } + copy(dbIDs, filter.DbID) query = query.Where(sq.Eq{"job.id": dbIDs}) } if filter.JobID != nil { From 9b325041c14040c5c79d3e5fe6ec0985c4bb40ba Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 14:30:30 +0200 Subject: [PATCH 26/45] Fix typo in jobCache columns --- internal/repository/job.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/repository/job.go 
b/internal/repository/job.go index c6c566e..3702099 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -60,12 +60,12 @@ var jobColumns []string = []string{ } var jobCacheColumns []string = []string{ - "jobcache.id", "jobcache.job_id", "jobcache.hpc_user", "jobcache.project", "jobcache.cluster", - "jobcache.subcluster", "jobcache.start_time", "jobcache.cluster_partition", - "jobcache.array_job_id", "jobcache.num_nodes", "jobcache.num_hwthreads", - "jobcache.num_acc", "jobcache.exclusive", "jobcache.monitoring_status", "jobcache.smt", - "jobcache.job_state", "jobcache.duration", "jobcache.walltime", "jobcache.resources", - "jobcache.footprint", "jobcache.energy", + "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", + "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", + "job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads", + "job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt", + "job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources", + "job_cache.footprint", "job_cache.energy", } func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { From 80032170923162edffa9a5e86609a7de38c384cd Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 14:41:02 +0200 Subject: [PATCH 27/45] Add string to gromacs app file --- internal/tagger/apps/gromacs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt index d8c0829..c5d939b 100644 --- a/internal/tagger/apps/gromacs.txt +++ b/internal/tagger/apps/gromacs.txt @@ -1,3 +1,4 @@ GROMACS gromacs GMX +mdrun From 5a88c77171a7c41a46c3b12577161462139cab64 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 26 May 2025 14:42:41 +0200 Subject: [PATCH 28/45] Remove debug output --- go.mod | 9 +++------ go.sum | 6 ------ internal/tagger/classifyJob.go | 3 +-- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index c57d9ed..6c92171 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,8 @@ require ( github.com/ClusterCockpit/cc-units v0.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/coreos/go-oidc/v3 v3.12.0 + github.com/expr-lang/expr v1.17.3 + github.com/fsnotify/fsnotify v1.9.0 github.com/go-co-op/gocron/v2 v2.16.0 github.com/go-ldap/ldap/v3 v3.4.10 github.com/go-sql-driver/mysql v1.9.0 @@ -20,6 +22,7 @@ require ( github.com/gorilla/sessions v1.4.0 github.com/influxdata/influxdb-client-go/v2 v2.14.0 github.com/jmoiron/sqlx v1.4.0 + github.com/joho/godotenv v1.5.1 github.com/mattn/go-sqlite3 v1.14.24 github.com/prometheus/client_golang v1.21.0 github.com/prometheus/common v0.62.0 @@ -43,9 +46,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect - github.com/expr-lang/expr v1.17.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect github.com/go-jose/go-jose/v4 v4.0.5 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect @@ -54,15 +55,12 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/go-viper/mapstructure/v2 v2.2.1 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/gookit/color v1.5.4 // indirect - github.com/gookit/goutil v0.6.18 // indirect github.com/gorilla/securecookie v1.1.2 // indirect 
github.com/gorilla/websocket v1.5.3 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect - github.com/joho/godotenv v1.5.1 // indirect github.com/jonboulle/clockwork v0.5.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect @@ -82,7 +80,6 @@ require ( github.com/sosodev/duration v1.3.1 // indirect github.com/swaggo/files v1.0.1 // indirect github.com/urfave/cli/v2 v2.27.5 // indirect - github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.uber.org/atomic v1.11.0 // indirect golang.org/x/mod v0.23.0 // indirect diff --git a/go.sum b/go.sum index 2102888..b4c3781 100644 --- a/go.sum +++ b/go.sum @@ -101,10 +101,6 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0= -github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w= -github.com/gookit/goutil v0.6.18 h1:MUVj0G16flubWT8zYVicIuisUiHdgirPAkmnfD2kKgw= -github.com/gookit/goutil v0.6.18/go.mod h1:AY/5sAwKe7Xck+mEbuxj0n/bc3qwrGNe3Oeulln7zBA= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -245,8 +241,6 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I= github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM= -github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= -github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index 9c4f7cb..16afe63 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -21,7 +21,6 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/expr-lang/expr" "github.com/expr-lang/expr/vm" - "github.com/gookit/goutil/dump" ) //go:embed jobclasses/* @@ -264,7 +263,7 @@ func (t *JobClassTagger) Match(job *schema.Job) { env[v.name] = value } - dump.P(env) + // dump.P(env) match, err := expr.Run(ri.rule, env) if err != nil { From 0aecea6de21932d99a2d7632aa0b01a1525472a4 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 27 May 2025 09:23:28 +0200 Subject: [PATCH 29/45] Refactor. Add Subcluster get metric list helper routine. 
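Editor's note (usage sketch, not part of this patch): the new
archive.GetMetricConfigSubCluster(cluster, subcluster) helper added below
returns the metric limits keyed by metric name, preferring a matching
subcluster override and falling back to the cluster-wide metric config
otherwise. A caller such as the job classifier can resolve thresholds once
per job. The snippet is illustrative only; the helper name and Metric fields
come from this patch, everything else (function name, logging) is assumed.

    import (
        "github.com/ClusterCockpit/cc-backend/pkg/archive"
        "github.com/ClusterCockpit/cc-backend/pkg/log"
    )

    // lookupPeak resolves the peak threshold of one metric for a given
    // cluster/subcluster pair via the new helper (hypothetical example).
    func lookupPeak(cluster, subcluster, metric string) float64 {
        limits := archive.GetMetricConfigSubCluster(cluster, subcluster)
        if m, ok := limits[metric]; ok {
            return m.Peak
        }
        log.Warnf("no metric config for %s on %s/%s", metric, cluster, subcluster)
        return 0.0
    }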
--- internal/tagger/classifyJob.go | 34 +++++++++++++++++----- pkg/archive/clusterConfig.go | 53 ++++++++++++++++++++++++++++++---- pkg/archive/nodelist.go | 9 +++--- pkg/schema/cluster.go | 32 ++++++++++---------- 4 files changed, 94 insertions(+), 34 deletions(-) diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index 16afe63..0af6738 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -124,7 +124,6 @@ func (t *JobClassTagger) prepareRule(b []byte, fns string) { } log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables)) - delete(t.rules, rule.Tag) t.rules[rule.Tag] = ri } @@ -139,16 +138,33 @@ func (t *JobClassTagger) EventCallback() { log.Fatal(err) } - for _, fn := range files { - fns := fn.Name() - log.Debugf("Process: %s", fns) - filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) - b, err := os.ReadFile(filename) + if util.CheckFileExists(t.cfgPath + "/parameters.json") { + log.Info("Merge parameters") + b, err := os.ReadFile(t.cfgPath + "/parameters.json") if err != nil { log.Warnf("prepareRule() > open file error: %v", err) - return } - t.prepareRule(b, fns) + + var paramTmp map[string]any + if err := json.NewDecoder(bytes.NewReader(b)).Decode(¶mTmp); err != nil { + log.Warn("Error while decoding parameters.json") + } + + maps.Copy(t.parameters, paramTmp) + } + + for _, fn := range files { + fns := fn.Name() + if fns != "parameters.json" { + log.Debugf("Process: %s", fns) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + b, err := os.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return + } + t.prepareRule(b, fns) + } } } @@ -220,6 +236,8 @@ func (t *JobClassTagger) Match(job *schema.Job) { env := make(map[string]any) maps.Copy(env, ri.env) log.Infof("Try to match rule %s for job %d", tag, job.JobID) + + // Initialize environment env["job"] = map[string]any{ "exclusive": job.Exclusive, "duration": job.Duration, diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index d53941b..95520a0 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -69,16 +69,18 @@ func initClusterConfig() error { for _, sc := range cluster.SubClusters { newMetric := &schema.MetricConfig{ - Unit: mc.Unit, + Metric: schema.Metric{ + Name: mc.Name, + Unit: mc.Unit, + Peak: mc.Peak, + Normal: mc.Normal, + Caution: mc.Caution, + Alert: mc.Alert, + }, Energy: mc.Energy, - Name: mc.Name, Scope: mc.Scope, Aggregation: mc.Aggregation, - Peak: mc.Peak, - Caution: mc.Caution, - Alert: mc.Alert, Timestep: mc.Timestep, - Normal: mc.Normal, LowerIsBetter: mc.LowerIsBetter, } @@ -167,6 +169,45 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) { return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster) } +func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric { + metrics := make(map[string]*schema.Metric) + + for _, c := range Clusters { + if c.Name == cluster { + for _, m := range c.MetricConfig { + for _, s := range m.SubClusters { + if s.Name == subcluster { + metrics[m.Name] = &schema.Metric{ + Name: m.Name, + Unit: s.Unit, + Peak: s.Peak, + Normal: s.Normal, + Caution: s.Caution, + Alert: s.Alert, + } + break + } + } + + _, ok := metrics[m.Name] + if !ok { + metrics[m.Name] = &schema.Metric{ + Name: m.Name, + Unit: m.Unit, + Peak: m.Peak, + Normal: m.Normal, + Caution: 
m.Caution, + Alert: m.Alert, + } + } + } + break + } + } + + return metrics +} + func GetMetricConfig(cluster, metric string) *schema.MetricConfig { for _, c := range Clusters { if c.Name == cluster { diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go index 7700185..26a15d2 100644 --- a/pkg/archive/nodelist.go +++ b/pkg/archive/nodelist.go @@ -61,7 +61,7 @@ func (nl *NodeList) PrintList() []string { } func (nl *NodeList) NodeCount() int { - var out int = 0 + out := 0 for _, term := range *nl { if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one out += 1 @@ -160,7 +160,7 @@ func (nle NLExprIntRange) limits() []map[string]int { m["start"] = int(nle.start) m["end"] = int(nle.end) m["digits"] = int(nle.digits) - if nle.zeroPadded == true { + if nle.zeroPadded { m["zeroPadded"] = 1 } else { m["zeroPadded"] = 0 @@ -183,14 +183,15 @@ func ParseNodeList(raw string) (NodeList, error) { rawterms := []string{} prevterm := 0 for i := 0; i < len(raw); i++ { - if raw[i] == '[' { + switch raw[i] { + case '[': for i < len(raw) && raw[i] != ']' { i++ } if i == len(raw) { return nil, fmt.Errorf("ARCHIVE/NODELIST > unclosed '['") } - } else if raw[i] == ',' { + case ',': rawterms = append(rawterms, raw[prevterm:i]) prevterm = i + 1 } diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index 322f308..1b9f2cc 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -45,31 +45,31 @@ type SubCluster struct { ThreadsPerCore int `json:"threadsPerCore"` } +type Metric struct { + Name string `json:"name"` + Unit Unit `json:"unit"` + Peak float64 `json:"peak"` + Normal float64 `json:"normal"` + Caution float64 `json:"caution"` + Alert float64 `json:"alert"` +} + type SubClusterConfig struct { - Name string `json:"name"` - Footprint string `json:"footprint,omitempty"` - Energy string `json:"energy"` - Peak float64 `json:"peak"` - Normal float64 `json:"normal"` - Caution float64 `json:"caution"` - Alert float64 `json:"alert"` - Remove bool `json:"remove"` - LowerIsBetter bool `json:"lowerIsBetter"` + Metric + Footprint string `json:"footprint,omitempty"` + Energy string `json:"energy"` + Remove bool `json:"remove"` + LowerIsBetter bool `json:"lowerIsBetter"` } type MetricConfig struct { - Unit Unit `json:"unit"` + Metric Energy string `json:"energy"` - Name string `json:"name"` Scope MetricScope `json:"scope"` Aggregation string `json:"aggregation"` Footprint string `json:"footprint,omitempty"` SubClusters []*SubClusterConfig `json:"subClusters,omitempty"` - Peak float64 `json:"peak"` - Caution float64 `json:"caution"` - Alert float64 `json:"alert"` Timestep int `json:"timestep"` - Normal float64 `json:"normal"` LowerIsBetter bool `json:"lowerIsBetter"` } @@ -127,7 +127,7 @@ func (topo *Topology) GetSocketsFromHWThreads( // those in the argument list are assigned to one of the sockets in the first // return value, return true as the second value. TODO: Optimize this, there // must be a more efficient way/algorithm. 
-func (topo *Topology) GetSocketsFromCores ( +func (topo *Topology) GetSocketsFromCores( cores []int, ) (sockets []int, exclusive bool) { socketsMap := map[int]int{} From cdfe7224576a7db0b798327b5f145a8169ae2eb2 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 27 May 2025 13:02:13 +0200 Subject: [PATCH 30/45] Include metric thresholds in rule environment Not yet tested --- internal/tagger/classifyJob.go | 13 ++++++++++++- internal/tagger/jobclasses/highload.json | 2 +- internal/tagger/jobclasses/lowload.json | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index 0af6738..6fd3fae 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -226,6 +226,7 @@ func (t *JobClassTagger) Register() error { func (t *JobClassTagger) Match(job *schema.Job) { r := repository.GetJobRepository() jobstats, err := archive.GetStatistics(job) + metricsList := archive.GetMetricConfigSubCluster(job.Cluster, job.SubCluster) log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) if err != nil { log.Errorf("job classification failed for job %d: %#v", job.JobID, err) @@ -255,7 +256,17 @@ func (t *JobClassTagger) Match(job *schema.Job) { log.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m) return } - env[m] = stats.Avg + env[m] = map[string]any{ + "min": stats.Min, + "max": stats.Max, + "avg": stats.Avg, + "limits": map[string]float64{ + "peak": metricsList[m].Peak, + "normal": metricsList[m].Normal, + "caution": metricsList[m].Caution, + "alert": metricsList[m].Alert, + }, + } } // check rule requirements apply diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 2715ee8..444ca4d 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -19,7 +19,7 @@ }, { "name": "load_perc", - "expr": "cpu_load / load_threshold" + "expr": "cpu_load.avg / load_threshold" } ], "rule": "cpu_load > load_threshold", diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 4c21a6b..1d7e041 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -21,6 +21,6 @@ "expr": "1.0 - (cpu_load / load_threshold)" } ], - "rule": "cpu_load < load_threshold", + "rule": "cpu_load.avg < load_threshold", "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}." 
} From 4a5fd96b329825649581af7b94a754be3e0acab9 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 May 2025 14:32:49 +0200 Subject: [PATCH 31/45] Adapt job class rules --- internal/tagger/jobclasses/highload.json | 7 +++---- internal/tagger/jobclasses/lowload.json | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 444ca4d..bab37be 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -1,7 +1,6 @@ { "name": "Excessive CPU load", "tag": "excessiveload", - "comment": "Assumptions: all nodes have the same number of cores.", "parameters": [ "excessivecpuload_threshold_factor", "job_min_duration_seconds", @@ -19,9 +18,9 @@ }, { "name": "load_perc", - "expr": "cpu_load.avg / load_threshold" + "expr": "cpu_load.avg / cpu_load.limits.peak" } ], - "rule": "cpu_load > load_threshold", - "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load}} falls above the threshold {{.load_threshold}}." + "rule": "cpu_load.avg > cpu_load.limits.peak", + "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.cpu_load.limits.peak}}." } diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 1d7e041..2212bd1 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -18,9 +18,9 @@ }, { "name": "load_perc", - "expr": "1.0 - (cpu_load / load_threshold)" + "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" } ], - "rule": "cpu_load.avg < load_threshold", - "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}." + "rule": "cpu_load.avg < cpu_load.limits.caution", + "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}." } From e35cfbc3dd04c1cb7231fec7341c2a519db54d5e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 May 2025 14:32:56 +0200 Subject: [PATCH 32/45] Refactor --- pkg/schema/job.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pkg/schema/job.go b/pkg/schema/job.go index df901b4..7475c36 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -68,6 +68,14 @@ type Job struct { // *int64 `json:"id,omitempty"` >> never used in the job-archive, only // available via REST-API // +// JobMeta model +// @Description Meta data information of a HPC job. +type JobMeta struct { + ID *int64 `json:"id,omitempty"` + Statistics map[string]JobStatistics `json:"statistics"` + BaseJob + StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` +} type JobLink struct { ID int64 `json:"id"` @@ -79,15 +87,6 @@ type JobLinkResultList struct { Count int `json:"count"` } -// JobMeta model -// @Description Meta data information of a HPC job. 
-type JobMeta struct { - ID *int64 `json:"id,omitempty"` - Statistics map[string]JobStatistics `json:"statistics"` - BaseJob - StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` -} - const ( MonitoringStatusDisabled int32 = 0 MonitoringStatusRunningOrArchiving int32 = 1 From eef48ac3a31af49da89f2e47a4d9a26f6a444bde Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 May 2025 14:33:52 +0200 Subject: [PATCH 33/45] Small fix in highload rule --- internal/tagger/jobclasses/highload.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index bab37be..01476c1 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -18,7 +18,7 @@ }, { "name": "load_perc", - "expr": "cpu_load.avg / cpu_load.limits.peak" + "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" } ], "rule": "cpu_load.avg > cpu_load.limits.peak", From 3efee2253678f51a1d4558ca0cf01922064619ca Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 May 2025 15:59:21 +0200 Subject: [PATCH 34/45] Remove jobMeta and use job struct everywhere --- go.mod | 4 - go.sum | 12 - internal/api/api_test.go | 2 +- internal/api/rest.go | 57 +- internal/archiver/archiveWorker.go | 4 +- internal/archiver/archiver.go | 14 +- internal/graph/schema.resolvers.go | 6 +- internal/importer/handleImport.go | 7 +- internal/importer/initDB.go | 31 +- internal/metricdata/cc-metric-store.go | 15 +- internal/metricdata/influxdb-v2.go | 575 ------------------ internal/metricdata/metricdata.go | 2 - internal/metricdata/prometheus.go | 9 +- internal/repository/job.go | 9 +- internal/repository/jobCreate.go | 4 +- internal/repository/jobFind.go | 2 +- internal/repository/job_test.go | 2 +- internal/repository/stats.go | 3 +- internal/repository/testdata/job.db | Bin 118784 -> 118784 bytes internal/tagger/classifyJob.go | 2 +- internal/tagger/detectApp.go | 2 +- .../taskManager/updateFootprintService.go | 10 +- pkg/archive/archive.go | 9 +- pkg/archive/archive_test.go | 5 +- pkg/archive/clusterConfig.go | 2 +- pkg/archive/fsBackend.go | 30 +- pkg/archive/fsBackend_test.go | 29 +- pkg/archive/json.go | 6 +- pkg/schema/job.go | 95 ++- 29 files changed, 163 insertions(+), 785 deletions(-) delete mode 100644 internal/metricdata/influxdb-v2.go diff --git a/go.mod b/go.mod index 6c92171..f55412d 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,6 @@ require ( github.com/gorilla/handlers v1.5.2 github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 - github.com/influxdata/influxdb-client-go/v2 v2.14.0 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 github.com/mattn/go-sqlite3 v1.14.24 @@ -42,7 +41,6 @@ require ( github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/KyleBanks/depth v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect - github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect @@ -60,7 +58,6 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect - github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect github.com/jonboulle/clockwork v0.5.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 
// indirect @@ -72,7 +69,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect - github.com/oapi-codegen/runtime v1.1.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index b4c3781..a935407 100644 --- a/go.sum +++ b/go.sum @@ -16,7 +16,6 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= -github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI= @@ -25,13 +24,10 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= -github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= -github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc/Jo= @@ -123,10 +119,6 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4= -github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= -github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= -github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= 
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -151,7 +143,6 @@ github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2E github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -186,8 +177,6 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= -github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -219,7 +208,6 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= -github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 3af37ad..a938cb6 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -278,7 +278,7 @@ func TestRestApi(t *testing.T) { job.MonitoringStatus != 1 || job.SMT != 1 || !reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) || - job.StartTime.Unix() != 123456789 { + job.StartTime != 123456789 { t.Fatalf("unexpected job properties: %#v", job) } diff --git a/internal/api/rest.go b/internal/api/rest.go index fe35942..31a5979 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -150,9 +150,9 @@ type DeleteJobApiRequest struct { // GetJobsApiResponse model type GetJobsApiResponse struct { - Jobs []*schema.JobMeta `json:"jobs"` // Array of jobs - Items int `json:"items"` // Number of jobs returned - Page int `json:"page"` // Page id returned + Jobs []*schema.Job `json:"jobs"` // Array of jobs + Items int 
`json:"items"` // Number of jobs returned + Page int `json:"page"` // Page id returned } // GetClustersApiResponse model @@ -361,7 +361,7 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { return } - results := make([]*schema.JobMeta, 0, len(jobs)) + results := make([]*schema.Job, 0, len(jobs)) for _, job := range jobs { if withMetadata { if _, err = api.JobRepository.FetchMetadata(job); err != nil { @@ -370,27 +370,21 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { } } - res := &schema.JobMeta{ - ID: &job.ID, - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - } - - res.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) if err != nil { handleError(err, http.StatusInternalServerError, rw) return } - if res.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { - res.Statistics, err = archive.GetStatistics(job) + if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { + job.Statistics, err = archive.GetStatistics(job) if err != nil { handleError(err, http.StatusInternalServerError, rw) return } } - results = append(results, res) + results = append(results, job) } log.Debugf("/api/jobs: %d jobs returned", len(results)) @@ -449,7 +443,7 @@ func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) return } - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) if err != nil { handleError(err, http.StatusInternalServerError, rw) return @@ -542,7 +536,7 @@ func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) { return } - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) if err != nil { handleError(err, http.StatusInternalServerError, rw) return @@ -683,7 +677,7 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { return } - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) if err != nil { http.Error(rw, err.Error(), http.StatusInternalServerError) return @@ -696,7 +690,7 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { } for _, tag := range req { - tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), job.ID, tag.Type, tag.Name, tag.Scope) + tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope) if err != nil { http.Error(rw, err.Error(), http.StatusInternalServerError) return @@ -745,7 +739,7 @@ func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) { return } - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) if err != nil { http.Error(rw, err.Error(), http.StatusInternalServerError) return @@ -764,7 +758,7 @@ func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) { continue } - remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), job.ID, 
rtag.Type, rtag.Name, rtag.Scope) + remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope) if err != nil { http.Error(rw, err.Error(), http.StatusInternalServerError) return @@ -840,7 +834,10 @@ func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/start_job/ [post] func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { - req := schema.JobMeta{BaseJob: schema.JobDefaults} + req := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } if err := decode(r.Body, &req); err != nil { handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) return @@ -849,7 +846,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { log.Printf("REST: %s\n", req.GoString()) req.State = schema.JobStateRunning - if err := importer.SanityChecks(&req.BaseJob); err != nil { + if err := importer.SanityChecks(&req); err != nil { handleError(err, http.StatusBadRequest, rw) return } @@ -866,7 +863,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { return } else if err == nil { for _, job := range jobs { - if (req.StartTime - job.StartTimeUnix) < 86400 { + if (req.StartTime - job.StartTime) < 86400 { handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw) return } @@ -1023,7 +1020,7 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) return } - err = api.JobRepository.DeleteJobById(job.ID) + err = api.JobRepository.DeleteJobById(*job.ID) if err != nil { handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) return @@ -1087,8 +1084,8 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - if job == nil || job.StartTime.Unix() > req.StopTime { - handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime.Unix()), http.StatusBadRequest, rw) + if job == nil || job.StartTime > req.StopTime { + handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw) return } @@ -1100,11 +1097,11 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } // Mark job as stopped in the database (update state and duration) - job.Duration = int32(req.StopTime - job.StartTime.Unix()) + job.Duration = int32(req.StopTime - job.StartTime) job.State = req.State api.JobRepository.Mutex.Lock() - if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { api.JobRepository.Mutex.Unlock() handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) return @@ -1112,7 +1109,7 @@ func (api *RestApi) 
checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } api.JobRepository.Mutex.Unlock() - log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) + log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) // Send a response (with status OK). This means that erros that happen from here on forward // can *NOT* be communicated to the client. If reading from a MetricDataRepository or diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 42a60b9..e9f3dc9 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -41,7 +41,7 @@ func archivingWorker() { // will fail if job meta not in repository if _, err := jobRepo.FetchMetadata(job); err != nil { log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error()) - jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) + jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed) continue } @@ -50,7 +50,7 @@ func archivingWorker() { jobMeta, err := ArchiveJob(job, context.Background()) if err != nil { log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error()) - jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) + jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed) continue } diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 1050ca1..b220d3b 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -16,7 +16,7 @@ import ( ) // Writes a running job to the job-archive -func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { +func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) { allMetrics := make([]string, 0) metricConfigs := archive.GetCluster(job.Cluster).MetricConfig for _, mc := range metricConfigs { @@ -40,11 +40,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { return nil, err } - jobMeta := &schema.JobMeta{ - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - Statistics: make(map[string]schema.JobStatistics), - } + job.Statistics = make(map[string]schema.JobStatistics) for metric, data := range jobData { avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 @@ -61,7 +57,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { } // Round AVG Result to 2 Digits - jobMeta.Statistics[metric] = schema.JobStatistics{ + job.Statistics[metric] = schema.JobStatistics{ Unit: schema.Unit{ Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, @@ -76,8 +72,8 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { // only return the JobMeta structure as the // statistics in there are needed. 
if config.Keys.DisableArchive { - return jobMeta, nil + return job, nil } - return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData) + return job, archive.GetHandle().ImportJob(job, &jobData) } diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 7e52b3d..ad2b050 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -31,7 +31,7 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) ( // Tags is the resolver for the tags field. func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { - return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID) + return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID) } // ConcurrentJobs is the resolver for the concurrentJobs field. @@ -615,9 +615,9 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job numThreadsInt := int(job.NumHWThreads) numAccsInt := int(job.NumAcc) res = append(res, &model.JobStats{ - ID: int(job.ID), + ID: int(*job.ID), JobID: strconv.Itoa(int(job.JobID)), - StartTime: int(job.StartTime.Unix()), + StartTime: int(job.StartTime), Duration: int(job.Duration), Cluster: job.Cluster, SubCluster: job.SubCluster, diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 623291c..83230f5 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -42,7 +42,10 @@ func HandleImportFlag(flag string) error { } dec := json.NewDecoder(bytes.NewReader(raw)) dec.DisallowUnknownFields() - job := schema.JobMeta{BaseJob: schema.JobDefaults} + job := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } if err = dec.Decode(&job); err != nil { log.Warn("Error while decoding raw json metadata for import") return err @@ -141,7 +144,7 @@ func HandleImportFlag(flag string) error { return err } - if err = SanityChecks(&job.BaseJob); err != nil { + if err = SanityChecks(&job); err != nil { log.Warn("BaseJob SanityChecks failed") return err } diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 9a2ccdf..1239951 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -60,11 +60,6 @@ func InitDB() error { } jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful - job := schema.Job{ - BaseJob: jobMeta.BaseJob, - StartTime: time.Unix(jobMeta.StartTime, 0), - StartTimeUnix: jobMeta.StartTime, - } sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { @@ -72,7 +67,7 @@ func InitDB() error { return err } - job.Footprint = make(map[string]float64) + jobMeta.Footprint = make(map[string]float64) for _, fp := range sc.Footprint { statType := "avg" @@ -83,16 +78,16 @@ func InitDB() error { name := fmt.Sprintf("%s_%s", fp, statType) - job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType) + jobMeta.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType) } - job.RawFootprint, err = json.Marshal(job.Footprint) + jobMeta.RawFootprint, err = json.Marshal(jobMeta.Footprint) if err != nil { log.Warn("Error while marshaling job footprint") return err } - job.EnergyFootprint = make(map[string]float64) + jobMeta.EnergyFootprint = make(map[string]float64) // Total Job Energy Outside Loop totalEnergy := 0.0 @@ -117,45 +112,45 @@ func InitDB() error { log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) } - job.EnergyFootprint[fp] = 
metricEnergy + jobMeta.EnergyFootprint[fp] = metricEnergy totalEnergy += metricEnergy } - job.Energy = (math.Round(totalEnergy*100.0) / 100.0) - if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { + jobMeta.Energy = (math.Round(totalEnergy*100.0) / 100.0) + if jobMeta.RawEnergyFootprint, err = json.Marshal(jobMeta.EnergyFootprint); err != nil { log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID) return err } - job.RawResources, err = json.Marshal(job.Resources) + jobMeta.RawResources, err = json.Marshal(jobMeta.Resources) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - job.RawMetaData, err = json.Marshal(job.MetaData) + jobMeta.RawMetaData, err = json.Marshal(jobMeta.MetaData) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - if err := SanityChecks(&job.BaseJob); err != nil { + if err := SanityChecks(jobMeta); err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } id, err := r.TransactionAddNamed(t, - repository.NamedJobInsert, job) + repository.NamedJobInsert, jobMeta) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - for _, tag := range job.Tags { + for _, tag := range jobMeta.Tags { tagstr := tag.Name + ":" + tag.Type tagId, ok := tags[tagstr] if !ok { @@ -190,7 +185,7 @@ func InitDB() error { } // This function also sets the subcluster if necessary! -func SanityChecks(job *schema.BaseJob) error { +func SanityChecks(job *schema.Job) error { if c := archive.GetCluster(job.Cluster); c == nil { return fmt.Errorf("no such cluster: %v", job.Cluster) } diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index 7c84d93..557e1d2 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -183,8 +183,8 @@ func (ccms *CCMetricStore) LoadData( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: true, @@ -570,7 +570,6 @@ func (ccms *CCMetricStore) LoadStats( metrics []string, ctx context.Context, ) (map[string]map[string]schema.MetricStatistics, error) { - queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization? 
if err != nil { log.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) @@ -579,8 +578,8 @@ func (ccms *CCMetricStore) LoadStats( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: false, @@ -638,8 +637,8 @@ func (ccms *CCMetricStore) LoadScopedStats( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: false, @@ -816,7 +815,6 @@ func (ccms *CCMetricStore) LoadNodeListData( page *model.PageRequest, ctx context.Context, ) (map[string]schema.JobData, int, bool, error) { - // 0) Init additional vars var totalNodes int = 0 var hasNextPage bool = false @@ -975,7 +973,6 @@ func (ccms *CCMetricStore) buildNodeQueries( scopes []schema.MetricScope, resolution int, ) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) assignedScope := []schema.MetricScope{} diff --git a/internal/metricdata/influxdb-v2.go b/internal/metricdata/influxdb-v2.go deleted file mode 100644 index c53dad3..0000000 --- a/internal/metricdata/influxdb-v2.go +++ /dev/null @@ -1,575 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. -package metricdata - -import ( - "context" - "crypto/tls" - "encoding/json" - "errors" - "fmt" - "math" - "sort" - "strings" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" - influxdb2 "github.com/influxdata/influxdb-client-go/v2" - influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" -) - -type InfluxDBv2DataRepositoryConfig struct { - Url string `json:"url"` - Token string `json:"token"` - Bucket string `json:"bucket"` - Org string `json:"org"` - SkipTls bool `json:"skiptls"` -} - -type InfluxDBv2DataRepository struct { - client influxdb2.Client - queryClient influxdb2Api.QueryAPI - bucket, measurement string -} - -func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error { - var config InfluxDBv2DataRepositoryConfig - if err := json.Unmarshal(rawConfig, &config); err != nil { - log.Warn("Error while unmarshaling raw json config") - return err - } - - idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls})) - idb.queryClient = idb.client.QueryAPI(config.Org) - idb.bucket = config.Bucket - - return nil -} - -func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string { - return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00” -} - -func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time { - return time.Unix(epoch, 0) -} - -func (idb *InfluxDBv2DataRepository) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int) (schema.JobData, error) { - - log.Infof("InfluxDB 2 Backend: Resolution Scaling not Implemented, will return default 
timestep. Requested Resolution %d", resolution) - - measurementsConds := make([]string, 0, len(metrics)) - for _, m := range metrics { - measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m)) - } - measurementsCond := strings.Join(measurementsConds, " or ") - - hostsConds := make([]string, 0, len(job.Resources)) - for _, h := range job.Resources { - if h.HWThreads != nil || h.Accelerators != nil { - // TODO - return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators") - } - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname)) - } - hostsCond := strings.Join(hostsConds, " or ") - - jobData := make(schema.JobData) // Empty Schema: map[FIELD]map[SCOPE]<*JobMetric>METRIC - // Requested Scopes - for _, scope := range scopes { - query := "" - switch scope { - case "node": - // Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows <-- Resolution could be added here? - // log.Info("Scope 'node' requested. ") - query = fmt.Sprintf(` - from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => (%s) and (%s) ) - |> drop(columns: ["_start", "_stop"]) - |> group(columns: ["hostname", "_measurement"]) - |> aggregateWindow(every: 60s, fn: mean) - |> drop(columns: ["_time"])`, - idb.bucket, - idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))), - measurementsCond, hostsCond) - case "socket": - log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ") - continue - case "core": - log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ") - continue - // Get Finest Granularity only, Set NULL to 0.0 - // query = fmt.Sprintf(` - // from(bucket: "%s") - // |> range(start: %s, stop: %s) - // |> filter(fn: (r) => %s ) - // |> filter(fn: (r) => %s ) - // |> drop(columns: ["_start", "_stop", "cluster"]) - // |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`, - // idb.bucket, - // idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )), - // measurementsCond, hostsCond) - case "hwthread": - log.Info(" Scope 'hwthread' requested, but not yet supported: Will return 'node' scope only. ") - continue - case "accelerator": - log.Info(" Scope 'accelerator' requested, but not yet supported: Will return 'node' scope only. ") - continue - default: - log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope) - continue - // return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'") - } - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - // Init Metrics: Only Node level now -> TODO: Matching /check on scope level ... - for _, metric := range metrics { - jobMetric, ok := jobData[metric] - if !ok { - mc := archive.GetMetricConfig(job.Cluster, metric) - jobMetric = map[schema.MetricScope]*schema.JobMetric{ - scope: { // uses scope var from above! 
- Unit: mc.Unit, - Timestep: mc.Timestep, - Series: make([]schema.Series, 0, len(job.Resources)), - StatisticsSeries: nil, // Should be: &schema.StatsSeries{}, - }, - } - } - jobData[metric] = jobMetric - } - - // Process Result: Time-Data - field, host, hostSeries := "", "", schema.Series{} - // typeId := 0 - switch scope { - case "node": - for rows.Next() { - row := rows.Record() - if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() { - if host != "" { - // Append Series before reset - jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - } - field, host = row.Measurement(), row.ValueByKey("hostname").(string) - hostSeries = schema.Series{ - Hostname: host, - Statistics: schema.MetricStatistics{}, //TODO Add Statistics - Data: make([]schema.Float, 0), - } - } - val, ok := row.Value().(float64) - if ok { - hostSeries.Data = append(hostSeries.Data, schema.Float(val)) - } else { - hostSeries.Data = append(hostSeries.Data, schema.Float(0)) - } - } - case "socket": - continue - case "accelerator": - continue - case "hwthread": - // See below @ core - continue - case "core": - continue - // Include Series.Id in hostSeries - // for rows.Next() { - // row := rows.Record() - // if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) { - // if ( host != "" ) { - // // Append Series before reset - // jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - // } - // field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int) - // hostSeries = schema.Series{ - // Hostname: host, - // Id: &typeId, - // Statistics: nil, - // Data: make([]schema.Float, 0), - // } - // } - // val := row.Value().(float64) - // hostSeries.Data = append(hostSeries.Data, schema.Float(val)) - // } - default: - log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope) - continue - // return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'") - } - // Append last Series - jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - } - - // Get Stats - stats, err := idb.LoadStats(job, metrics, ctx) - if err != nil { - log.Warn("Error while loading statistics") - return nil, err - } - - for _, scope := range scopes { - if scope == "node" { // No 'socket/core' support yet - for metric, nodes := range stats { - for node, stats := range nodes { - for index, _ := range jobData[metric][scope].Series { - if jobData[metric][scope].Series[index].Hostname == node { - jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max} - } - } - } - } - } - } - - return jobData, nil -} - -func (idb *InfluxDBv2DataRepository) LoadStats( - job *schema.Job, - metrics []string, - ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) { - - stats := map[string]map[string]schema.MetricStatistics{} - - hostsConds := make([]string, 0, len(job.Resources)) - for _, h := range job.Resources { - if h.HWThreads != nil || h.Accelerators != nil { - // TODO - return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators") - } - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname)) - } - hostsCond := strings.Join(hostsConds, " or ") - - // lenMet := len(metrics) - - for _, metric := range 
metrics { - // log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet) - - query := fmt.Sprintf(` - data = from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s)) - union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"), - data |> min(column: "_value") |> set(key: "_field", value: "min"), - data |> max(column: "_value") |> set(key: "_field", value: "max")]) - |> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value") - |> group()`, - idb.bucket, - idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))), - metric, hostsCond) - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - nodes := map[string]schema.MetricStatistics{} - for rows.Next() { - row := rows.Record() - host := row.ValueByKey("hostname").(string) - - avg, avgok := row.ValueByKey("avg").(float64) - if !avgok { - // log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg) - avg = 0.0 - } - min, minok := row.ValueByKey("min").(float64) - if !minok { - // log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min) - min = 0.0 - } - max, maxok := row.ValueByKey("max").(float64) - if !maxok { - // log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max) - max = 0.0 - } - - nodes[host] = schema.MetricStatistics{ - Avg: avg, - Min: min, - Max: max, - } - } - stats[metric] = nodes - } - - return stats, nil -} - -// Used in Job-View StatsTable -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context) (schema.ScopedJobStats, error) { - - // Assumption: idb.loadData() only returns series node-scope - use node scope for statsTable - scopedJobStats := make(schema.ScopedJobStats) - data, err := idb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) - if err != nil { - log.Warn("Error while loading job for scopedJobStats") - return nil, err - } - - for metric, metricData := range data { - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope) - }) - continue - } - - if _, ok := scopedJobStats[metric]; !ok { - scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) - } - - if _, ok := scopedJobStats[metric][scope]; !ok { - scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) - } - - for _, series := range metricData[scope].Series { - scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ - Hostname: series.Hostname, - Data: &series.Statistics, - }) - } - } - } - - return scopedJobStats, nil -} - -// Used in Systems-View @ Node-Overview -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) { - - // Note: scopes[] Array will be ignored, only return node scope - - // CONVERT ARGS TO INFLUX - measurementsConds := make([]string, 0) - for _, m := range metrics { - measurementsConds = 
append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m)) - } - measurementsCond := strings.Join(measurementsConds, " or ") - - hostsConds := make([]string, 0) - if nodes == nil { - var allNodes []string - subClusterNodeLists := archive.NodeLists[cluster] - for _, nodeList := range subClusterNodeLists { - allNodes = append(nodes, nodeList.PrintList()...) - } - for _, node := range allNodes { - nodes = append(nodes, node) - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node)) - } - } else { - for _, node := range nodes { - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node)) - } - } - hostsCond := strings.Join(hostsConds, " or ") - - // BUILD AND PERFORM QUERY - query := fmt.Sprintf(` - from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => (%s) and (%s) ) - |> drop(columns: ["_start", "_stop"]) - |> group(columns: ["hostname", "_measurement"]) - |> aggregateWindow(every: 60s, fn: mean) - |> drop(columns: ["_time"])`, - idb.bucket, - idb.formatTime(from), idb.formatTime(to), - measurementsCond, hostsCond) - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - // HANDLE QUERY RETURN - // Collect Float Arrays for Node@Metric -> No Scope Handling! - influxData := make(map[string]map[string][]schema.Float) - for rows.Next() { - row := rows.Record() - host, field := row.ValueByKey("hostname").(string), row.Measurement() - - influxHostData, ok := influxData[host] - if !ok { - influxHostData = make(map[string][]schema.Float) - influxData[host] = influxHostData - } - - influxFieldData, ok := influxData[host][field] - if !ok { - influxFieldData = make([]schema.Float, 0) - influxData[host][field] = influxFieldData - } - - val, ok := row.Value().(float64) - if ok { - influxData[host][field] = append(influxData[host][field], schema.Float(val)) - } else { - influxData[host][field] = append(influxData[host][field], schema.Float(0)) - } - } - - // BUILD FUNCTION RETURN - data := make(map[string]map[string][]*schema.JobMetric) - for node, metricData := range influxData { - - nodeData, ok := data[node] - if !ok { - nodeData = make(map[string][]*schema.JobMetric) - data[node] = nodeData - } - - for metric, floatArray := range metricData { - avg, min, max := 0.0, 0.0, 0.0 - for _, val := range floatArray { - avg += float64(val) - min = math.Min(min, float64(val)) - max = math.Max(max, float64(val)) - } - - stats := schema.MetricStatistics{ - Avg: (math.Round((avg/float64(len(floatArray)))*100) / 100), - Min: (math.Round(min*100) / 100), - Max: (math.Round(max*100) / 100), - } - - mc := archive.GetMetricConfig(cluster, metric) - nodeData[metric] = append(nodeData[metric], &schema.JobMetric{ - Unit: mc.Unit, - Timestep: mc.Timestep, - Series: []schema.Series{ - { - Hostname: node, - Statistics: stats, - Data: floatArray, - }, - }, - }) - } - } - - return data, nil -} - -// Used in Systems-View @ Node-List -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadNodeListData( - cluster, subCluster, nodeFilter string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - page *model.PageRequest, - ctx context.Context, -) (map[string]schema.JobData, int, bool, error) { - - // Assumption: idb.loadData() only returns series node-scope - use node scope for NodeList - - // 0) Init additional vars - var totalNodes int = 0 - var hasNextPage bool = false - - // 1) Get list of all nodes - var nodes []string - if subCluster != 
"" { - scNodes := archive.NodeLists[cluster][subCluster] - nodes = scNodes.PrintList() - } else { - subClusterNodeLists := archive.NodeLists[cluster] - for _, nodeList := range subClusterNodeLists { - nodes = append(nodes, nodeList.PrintList()...) - } - } - - // 2) Filter nodes - if nodeFilter != "" { - filteredNodes := []string{} - for _, node := range nodes { - if strings.Contains(node, nodeFilter) { - filteredNodes = append(filteredNodes, node) - } - } - nodes = filteredNodes - } - - // 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ... - totalNodes = len(nodes) - sort.Strings(nodes) - - // 3) Apply paging - if len(nodes) > page.ItemsPerPage { - start := (page.Page - 1) * page.ItemsPerPage - end := start + page.ItemsPerPage - if end > len(nodes) { - end = len(nodes) - hasNextPage = false - } else { - hasNextPage = true - } - nodes = nodes[start:end] - } - - // 4) Fetch And Convert Data, use idb.LoadNodeData() for query - - rawNodeData, err := idb.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) - if err != nil { - log.Error(fmt.Sprintf("Error while loading influx nodeData for nodeListData %#v\n", err)) - return nil, totalNodes, hasNextPage, err - } - - data := make(map[string]schema.JobData) - for node, nodeData := range rawNodeData { - // Init Nested Map Data Structures If Not Found - hostData, ok := data[node] - if !ok { - hostData = make(schema.JobData) - data[node] = hostData - } - - for metric, nodeMetricData := range nodeData { - metricData, ok := hostData[metric] - if !ok { - metricData = make(map[schema.MetricScope]*schema.JobMetric) - data[node][metric] = metricData - } - - data[node][metric][schema.MetricScopeNode] = nodeMetricData[0] // Only Node Scope Returned from loadNodeData - } - } - - return data, totalNodes, hasNextPage, nil -} diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go index f30d837..e6b739a 100644 --- a/internal/metricdata/metricdata.go +++ b/internal/metricdata/metricdata.go @@ -54,8 +54,6 @@ func Init() error { switch kind.Kind { case "cc-metric-store": mdr = &CCMetricStore{} - case "influxdb": - mdr = &InfluxDBv2DataRepository{} case "prometheus": mdr = &PrometheusDataRepository{} case "test": diff --git a/internal/metricdata/prometheus.go b/internal/metricdata/prometheus.go index d16501e..fa49764 100644 --- a/internal/metricdata/prometheus.go +++ b/internal/metricdata/prometheus.go @@ -279,8 +279,8 @@ func (pdb *PrometheusDataRepository) LoadData( for i, resource := range job.Resources { nodes[i] = resource.Hostname } - from := job.StartTime - to := job.StartTime.Add(time.Duration(job.Duration) * time.Second) + from := time.Unix(job.StartTime, 0) + to := time.Unix(job.StartTime+int64(job.Duration), 0) for _, scope := range scopes { if scope != schema.MetricScopeNode { @@ -453,8 +453,8 @@ func (pdb *PrometheusDataRepository) LoadScopedStats( job *schema.Job, metrics []string, scopes []schema.MetricScope, - ctx context.Context) (schema.ScopedJobStats, error) { - + ctx context.Context, +) (schema.ScopedJobStats, error) { // Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable scopedJobStats := make(schema.ScopedJobStats) data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) @@ -502,7 +502,6 @@ func (pdb *PrometheusDataRepository) LoadNodeListData( page *model.PageRequest, ctx context.Context, ) (map[string]schema.JobData, int, bool, error) { - // Assumption: pdb.loadData() only returns series 
node-scope - use node scope for NodeList // 0) Init additional vars diff --git a/internal/repository/job.go b/internal/repository/job.go index 3702099..c800141 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -73,7 +73,7 @@ func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { if err := row.Scan( &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, - &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, + &job.StartTime, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, &job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil { log.Warnf("Error while scanning rows (Job): %v", err) @@ -92,10 +92,9 @@ func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { } job.RawFootprint = nil - job.StartTime = time.Unix(job.StartTimeUnix, 0) // Always ensure accurate duration for running jobs if job.State == schema.JobStateRunning { - job.Duration = int32(time.Since(job.StartTime).Seconds()) + job.Duration = int32(time.Now().Unix() - job.StartTime) } return job, nil @@ -582,7 +581,7 @@ func (r *JobRepository) MarkArchived( func (r *JobRepository) UpdateEnergy( stmt sq.UpdateBuilder, - jobMeta *schema.JobMeta, + jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) @@ -632,7 +631,7 @@ func (r *JobRepository) UpdateEnergy( func (r *JobRepository) UpdateFootprint( stmt sq.UpdateBuilder, - jobMeta *schema.JobMeta, + jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index f286c68..1508c8d 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -29,7 +29,7 @@ const NamedJobInsert string = `INSERT INTO job ( :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` -func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { +func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { r.Mutex.Lock() res, err := r.DB.NamedExec(NamedJobCacheInsert, job) r.Mutex.Unlock() @@ -87,7 +87,7 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! 
-func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { +func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { job.RawFootprint, err = json.Marshal(job.Footprint) if err != nil { return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index b820084..2acdb87 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -227,7 +227,7 @@ func (r *JobRepository) FindConcurrentJobs( var startTime int64 var stopTime int64 - startTime = job.StartTimeUnix + startTime = job.StartTime hostname := job.Resources[0].Hostname if job.State == schema.JobStateRunning { diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 363bb6c..bf7abd9 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -24,7 +24,7 @@ func TestFind(t *testing.T) { // fmt.Printf("%+v", job) - if job.ID != 5 { + if *job.ID != 5 { t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID) } } diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 410ba6c..7a5078f 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -291,7 +291,7 @@ func (r *JobRepository) JobsStats( return stats, nil } -func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 { +func LoadJobStat(job *schema.Job, metric string, statType string) float64 { if stats, ok := job.Statistics[metric]; ok { switch statType { case "avg": @@ -759,7 +759,6 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( filters []*model.JobFilter, bins *int, ) []*model.MetricHistoPoints { - // Get Jobs jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) if err != nil { diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 43ec9d3c7f36c7ea505a96cc1208c4ce7a148eed..c65dfd093bbf6f8e9cbedd58c8a4e9c19cc662ed 100644 GIT binary patch delta 1228 zcmZ`(PfXKL81HMlb{!jUIL4Sl_Qt=AWW)UdNx-OME`kzBa2h?;vNddEW2Ngryf8c| zC$TY{Oo-k*GNTs{n&^eZgb+P=G8(-Zgm^MB(f7JBF~cTb``+)*_kHhsNlP1iX@kFa zqN$alsP<|I_&dDjk6^Ix*1QStC_8ykwl$@hWmnnNa+7_9(FJye3w3a=xMzL)6lm*A zO3Ab=hGz5AqOr7Ki<$WDx|?{H!vXHQ^?9~?9SwrF=p)= zD7XWkz!EN?!YLoaQ4Y5|A}NZpq^0w7GuiyKlnn>N1A*W`AUueIgG1rSP=Bb0#!9E5 zT#yRNe|7rKWEhv(A!Rz6x@eD;SZeYGqiI&mIIohIs6q&^rMpfnF3gQncev1 z@!PPYk?Nt~1K86DZ11f7R)(M9&vC;X#eQSoP}ea@9lC-<-4)!)czKt{1J=BnG@X@e zcT-!iv8i}85l4yW$aoxe)bI}EvmpFrQaUiT_{frQnG_lsyD&D9KogTVu8fcS4Ouc< zENHTd^nJkviMUXlK8!UQm?o*3sAY1p{?OEABT!mNip4_JXeEfE=4a)kW@K6>KsAl1 zU~@*x^qOBvC<=G+HKY zI6I9a#zy0@OUQS?5npviq)Yuh*Jn0#WW(^0Wn;HfZ8#h_ Lnx4141Yh7EED1C} delta 599 zcmZozz}~QceS);0I0FNNEEMwqX{(7k#y~;6szP2MkB8+b1OHThwTT5Hn*{}0nJ06y zG)~rG;o<7^5M&n@7G|7OKDqF?>gM$0b67Sfa?TTDMJZtV5qtm2}wjA=!gB~`hJnR(fU z24<;Ah6Zdn#948Ovxu^a3d=Gk=A>36ni%936l5BhnP)I#6=f0uspSN#6+tKl`JR#g z5(EDwpvO<}Prl@D#Ky?~j)DIjkpF~#^1FC3F0co`@PFZd4;GVQ unkown cluster: %v", job.Cluster) diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index a59b663..a90c092 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -53,7 +53,7 @@ func getDirectory( rootPath, job.Cluster, lvl1, lvl2, - strconv.FormatInt(job.StartTime.Unix(), 10)) + strconv.FormatInt(job.StartTime, 10)) } func getPath( @@ -65,15 +65,15 @@ func getPath( getDirectory(job, rootPath), file) } -func loadJobMeta(filename string) (*schema.JobMeta, error) { +func loadJobMeta(filename string) (*schema.Job, error) { b, 
err := os.ReadFile(filename) if err != nil { log.Errorf("loadJobMeta() > open file error: %v", err) - return &schema.JobMeta{}, err + return nil, err } if config.Keys.Validate { if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil { - return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err) + return nil, fmt.Errorf("validate job meta: %v", err) } } @@ -429,7 +429,7 @@ func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, erro return loadJobStats(filename, isCompressed) } -func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { +func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.Job, error) { filename := getPath(job, fsa.path, "meta.json") return loadJobMeta(filename) } @@ -518,18 +518,13 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { return ch } -func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error { - job := schema.Job{ - BaseJob: jobMeta.BaseJob, - StartTime: time.Unix(jobMeta.StartTime, 0), - StartTimeUnix: jobMeta.StartTime, - } - f, err := os.Create(getPath(&job, fsa.path, "meta.json")) +func (fsa *FsArchive) StoreJobMeta(job *schema.Job) error { + f, err := os.Create(getPath(job, fsa.path, "meta.json")) if err != nil { log.Error("Error while creating filepath for meta.json") return err } - if err := EncodeJobMeta(f, jobMeta); err != nil { + if err := EncodeJobMeta(f, job); err != nil { log.Error("Error while encoding job metadata to meta.json file") return err } @@ -546,15 +541,10 @@ func (fsa *FsArchive) GetClusters() []string { } func (fsa *FsArchive) ImportJob( - jobMeta *schema.JobMeta, + jobMeta *schema.Job, jobData *schema.JobData, ) error { - job := schema.Job{ - BaseJob: jobMeta.BaseJob, - StartTime: time.Unix(jobMeta.StartTime, 0), - StartTimeUnix: jobMeta.StartTime, - } - dir := getPath(&job, fsa.path, "") + dir := getPath(jobMeta, fsa.path, "") if err := os.MkdirAll(dir, 0777); err != nil { log.Error("Error while creating job archive path") return err diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go index 9db68ed..ddb430a 100644 --- a/pkg/archive/fsBackend_test.go +++ b/pkg/archive/fsBackend_test.go @@ -9,7 +9,6 @@ import ( "fmt" "path/filepath" "testing" - "time" "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -86,8 +85,11 @@ func TestLoadJobMeta(t *testing.T) { t.Fatal(err) } - jobIn := schema.Job{BaseJob: schema.JobDefaults} - jobIn.StartTime = time.Unix(1608923076, 0) + jobIn := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + jobIn.StartTime = 1608923076 jobIn.JobID = 1403244 jobIn.Cluster = "emmy" @@ -114,8 +116,11 @@ func TestLoadJobData(t *testing.T) { t.Fatal(err) } - jobIn := schema.Job{BaseJob: schema.JobDefaults} - jobIn.StartTime = time.Unix(1608923076, 0) + jobIn := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + jobIn.StartTime = 1608923076 jobIn.JobID = 1403244 jobIn.Cluster = "emmy" @@ -142,8 +147,11 @@ func BenchmarkLoadJobData(b *testing.B) { var fsa FsArchive fsa.Init(json.RawMessage(archiveCfg)) - jobIn := schema.Job{BaseJob: schema.JobDefaults} - jobIn.StartTime = time.Unix(1608923076, 0) + jobIn := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + jobIn.StartTime = 1608923076 jobIn.JobID = 1403244 jobIn.Cluster = "emmy" @@ -165,8 +173,11 @@ func BenchmarkLoadJobDataCompressed(b *testing.B) { var fsa FsArchive 
fsa.Init(json.RawMessage(archiveCfg)) - jobIn := schema.Job{BaseJob: schema.JobDefaults} - jobIn.StartTime = time.Unix(1608923076, 0) + jobIn := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + jobIn.StartTime = 1608923076 jobIn.JobID = 1403244 jobIn.Cluster = "emmy" diff --git a/pkg/archive/json.go b/pkg/archive/json.go index 5201b74..d3639f5 100644 --- a/pkg/archive/json.go +++ b/pkg/archive/json.go @@ -69,8 +69,8 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) { return nil, err } -func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) { - var d schema.JobMeta +func DecodeJobMeta(r io.Reader) (*schema.Job, error) { + var d schema.Job if err := json.NewDecoder(r).Decode(&d); err != nil { log.Warn("Error while decoding raw job meta json") return &d, err @@ -103,7 +103,7 @@ func EncodeJobData(w io.Writer, d *schema.JobData) error { return nil } -func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error { +func EncodeJobMeta(w io.Writer, d *schema.Job) error { // Sanitize parameters if err := json.NewEncoder(w).Encode(d); err != nil { log.Warn("Error while encoding new job meta json") diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 7475c36..ef1ecde 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -8,43 +8,8 @@ import ( "errors" "fmt" "io" - "time" ) -// BaseJob is the common part of the job metadata structs -// -// Common subset of Job and JobMeta. Use one of those, not this type directly. - -type BaseJob struct { - Cluster string `json:"cluster" db:"cluster" example:"fritz"` - SubCluster string `json:"subCluster" db:"subcluster" example:"main"` - Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"` - Project string `json:"project" db:"project" example:"abcd200"` - User string `json:"user" db:"hpc_user" example:"abcd100h"` - State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` - Tags []*Tag `json:"tags,omitempty"` - RawEnergyFootprint []byte `json:"-" db:"energy_footprint"` - RawFootprint []byte `json:"-" db:"footprint"` - RawMetaData []byte `json:"-" db:"meta_data"` - RawResources []byte `json:"-" db:"resources"` - Resources []*Resource `json:"resources"` - EnergyFootprint map[string]float64 `json:"energyFootprint"` - Footprint map[string]float64 `json:"footprint"` - MetaData map[string]string `json:"metaData"` - ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` - Energy float64 `json:"energy" db:"energy"` - ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` - Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` - JobID int64 `json:"jobId" db:"job_id" example:"123000"` - Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` - SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` - MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` - Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` - NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` - NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` - NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` -} - // Job struct type // // This type is used as the GraphQL interface and using sqlx as a table row. 
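To make the following hunk easier to read: this patch drops the StartTime time.Time / StartTimeUnix int64 pair and keeps the start time only as Unix seconds on the unified Job struct. Below is a minimal, illustrative Go sketch of how call sites bridge the two representations, mirroring the conversions this patch applies in the Prometheus backend (time.Unix(job.StartTime, 0)) and in scanJob() for running-job durations; the helper name startTimeAsTime and the main function are illustration only and are not part of the patch:

package main

import (
	"fmt"
	"time"
)

// startTimeAsTime interprets a job start time stored as Unix seconds
// (schema.Job.StartTime after this patch) as a time.Time.
func startTimeAsTime(startTime int64) time.Time {
	return time.Unix(startTime, 0)
}

func main() {
	start := int64(1649723812) // example value from the struct tag in the hunk below

	// Time-based APIs (e.g. the Prometheus query range) convert on demand:
	fmt.Println(startTimeAsTime(start).UTC())

	// Running-job duration is recomputed directly on the integer representation,
	// as scanJob() does after this patch:
	duration := int32(time.Now().Unix() - start)
	fmt.Println(duration)
}
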
@@ -52,10 +17,36 @@ type BaseJob struct { // Job model // @Description Information of a HPC job. type Job struct { - StartTime time.Time `json:"startTime"` - BaseJob - ID int64 `json:"id" db:"id"` - StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` + Cluster string `json:"cluster" db:"cluster" example:"fritz"` + SubCluster string `json:"subCluster" db:"subcluster" example:"main"` + Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"` + Project string `json:"project" db:"project" example:"abcd200"` + User string `json:"user" db:"hpc_user" example:"abcd100h"` + State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` + Tags []*Tag `json:"tags,omitempty"` + RawEnergyFootprint []byte `json:"-" db:"energy_footprint"` + RawFootprint []byte `json:"-" db:"footprint"` + RawMetaData []byte `json:"-" db:"meta_data"` + RawResources []byte `json:"-" db:"resources"` + Resources []*Resource `json:"resources"` + EnergyFootprint map[string]float64 `json:"energyFootprint"` + Footprint map[string]float64 `json:"footprint"` + MetaData map[string]string `json:"metaData"` + ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` + Energy float64 `json:"energy" db:"energy"` + ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` + Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` + JobID int64 `json:"jobId" db:"job_id" example:"123000"` + Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` + SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` + MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` + Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` + NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` + NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` + NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` + Statistics map[string]JobStatistics `json:"statistics"` + ID *int64 `json:"id,omitempty" db:"id"` + StartTime int64 `json:"startTime" db:"start_time" example:"1649723812"` } // JobMeta struct type @@ -70,12 +61,12 @@ type Job struct { // // JobMeta model // @Description Meta data information of a HPC job. 
-type JobMeta struct { - ID *int64 `json:"id,omitempty"` - Statistics map[string]JobStatistics `json:"statistics"` - BaseJob - StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` -} +// type JobMeta struct { +// ID *int64 `json:"id,omitempty"` +// BaseJob +// Statistics map[string]JobStatistics `json:"statistics"` +// StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` +// } type JobLink struct { ID int64 `json:"id"` @@ -94,10 +85,10 @@ const ( MonitoringStatusArchivingSuccessful int32 = 3 ) -var JobDefaults BaseJob = BaseJob{ - Exclusive: 1, - MonitoringStatus: MonitoringStatusRunningOrArchiving, -} +// var JobDefaults Job = Job{ +// Exclusive: 1, +// MonitoringStatus: MonitoringStatusRunningOrArchiving, +// } type Unit struct { Base string `json:"base"` @@ -144,9 +135,9 @@ const ( JobStateOutOfMemory JobState = "out_of_memory" ) -func (j JobMeta) GoString() string { - return fmt.Sprintf("JobMeta{ID:%d, StartTime:%d, JobID:%v, BaseJob:%v}", - j.ID, j.StartTime, j.JobID, j.BaseJob) +func (j Job) GoString() string { + return fmt.Sprintf("Job{ID:%d, StartTime:%d, JobID:%v, BaseJob:%v}", + j.ID, j.StartTime, j.JobID, j) } func (e *JobState) UnmarshalGQL(v any) error { From 1bad6ba065775b01c67bd5805060c7943c484db1 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 May 2025 16:00:47 +0200 Subject: [PATCH 35/45] Regenerate GraphQL interface --- internal/graph/generated/generated.go | 95 ++++++++++++++++++++++++--- internal/graph/schema.resolvers.go | 19 +++--- 2 files changed, 96 insertions(+), 18 deletions(-) diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index e73bcf1..60e3ca0 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -398,6 +398,8 @@ type ClusterResolver interface { Partitions(ctx context.Context, obj *schema.Cluster) ([]string, error) } type JobResolver interface { + StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) + Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) @@ -5456,9 +5458,9 @@ func (ec *executionContext) _Job_id(ctx context.Context, field graphql.Collected } return graphql.Null } - res := resTmp.(int64) + res := resTmp.(*int64) fc.Result = res - return ec.marshalNID2int64(ctx, field.Selections, res) + return ec.marshalNID2ᚖint64(ctx, field.Selections, res) } func (ec *executionContext) fieldContext_Job_id(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -5708,7 +5710,7 @@ func (ec *executionContext) _Job_startTime(ctx context.Context, field graphql.Co }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { ctx = rctx // use context from middleware stack in children - return obj.StartTime, nil + return ec.resolvers.Job().StartTime(rctx, obj) }) if err != nil { ec.Error(ctx, err) @@ -5720,17 +5722,17 @@ func (ec *executionContext) _Job_startTime(ctx context.Context, field graphql.Co } return graphql.Null } - res := resTmp.(time.Time) + res := resTmp.(*time.Time) fc.Result = res - return ec.marshalNTime2timeᚐTime(ctx, field.Selections, res) + return ec.marshalNTime2ᚖtimeᚐTime(ctx, field.Selections, res) } func (ec *executionContext) fieldContext_Job_startTime(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "Job", Field: field, - IsMethod: false, 
- IsResolver: false, + IsMethod: true, + IsResolver: true, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { return nil, errors.New("field of type Time does not have child fields") }, @@ -17424,10 +17426,41 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj atomic.AddUint32(&out.Invalids, 1) } case "startTime": - out.Values[i] = ec._Job_startTime(ctx, field, obj) - if out.Values[i] == graphql.Null { - atomic.AddUint32(&out.Invalids, 1) + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Job_startTime(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "duration": out.Values[i] = ec._Job_duration(ctx, field, obj) if out.Values[i] == graphql.Null { @@ -20580,6 +20613,27 @@ func (ec *executionContext) marshalNID2ᚕstringᚄ(ctx context.Context, sel ast return ret } +func (ec *executionContext) unmarshalNID2ᚖint64(ctx context.Context, v any) (*int64, error) { + res, err := graphql.UnmarshalInt64(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNID2ᚖint64(ctx context.Context, sel ast.SelectionSet, v *int64) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + res := graphql.MarshalInt64(*v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) unmarshalNInt2int(ctx context.Context, v any) (int, error) { res, err := graphql.UnmarshalInt(v) return res, graphql.ErrorOnPath(ctx, err) @@ -21799,6 +21853,27 @@ func (ec *executionContext) marshalNTime2timeᚐTime(ctx context.Context, sel as return res } +func (ec *executionContext) unmarshalNTime2ᚖtimeᚐTime(ctx context.Context, v any) (*time.Time, error) { + res, err := graphql.UnmarshalTime(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNTime2ᚖtimeᚐTime(ctx context.Context, sel ast.SelectionSet, v *time.Time) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + res := graphql.MarshalTime(*v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) marshalNTimeWeights2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐTimeWeights(ctx context.Context, sel 
ast.SelectionSet, v *model.TimeWeights) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index ad2b050..6b790a5 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -29,6 +29,11 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) ( return r.Repo.Partitions(obj.Name) } +// StartTime is the resolver for the startTime field. +func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) { + panic(fmt.Errorf("not implemented: StartTime - startTime")) +} + // Tags is the resolver for the tags field. func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID) @@ -776,11 +781,9 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } // SubCluster returns generated.SubClusterResolver implementation. func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} } -type ( - clusterResolver struct{ *Resolver } - jobResolver struct{ *Resolver } - metricValueResolver struct{ *Resolver } - mutationResolver struct{ *Resolver } - queryResolver struct{ *Resolver } - subClusterResolver struct{ *Resolver } -) +type clusterResolver struct{ *Resolver } +type jobResolver struct{ *Resolver } +type metricValueResolver struct{ *Resolver } +type mutationResolver struct{ *Resolver } +type queryResolver struct{ *Resolver } +type subClusterResolver struct{ *Resolver } From 4dc0da5099ae17fd67219e8311cfe148d18a0fff Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 2 Jun 2025 12:07:01 +0200 Subject: [PATCH 36/45] Add node table schema --- internal/repository/migration.go | 2 +- .../migrations/sqlite3/10_node-table.down.sql | 1 + .../migrations/sqlite3/10_node-table.up.sql | 17 +++++++++ pkg/schema/node.go | 35 +++++++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 internal/repository/migrations/sqlite3/10_node-table.down.sql create mode 100644 internal/repository/migrations/sqlite3/10_node-table.up.sql create mode 100644 pkg/schema/node.go diff --git a/internal/repository/migration.go b/internal/repository/migration.go index c0693da..fb78170 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,7 +16,7 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) -const Version uint = 9 +const Version uint = 10 //go:embed migrations/* var migrationFiles embed.FS diff --git a/internal/repository/migrations/sqlite3/10_node-table.down.sql b/internal/repository/migrations/sqlite3/10_node-table.down.sql new file mode 100644 index 0000000..9119a5a --- /dev/null +++ b/internal/repository/migrations/sqlite3/10_node-table.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS node; diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql new file mode 100644 index 0000000..e4cb6c0 --- /dev/null +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -0,0 +1,17 @@ +CREATE TABLE "node" ( + id INTEGER PRIMARY KEY, + hostname VARCHAR(255) NOT NULL, + cluster VARCHAR(255) NOT NULL, + subcluster VARCHAR(255) NOT NULL, + node_state VARCHAR(255) NOT NULL + CHECK (job_state IN ( + 'allocated', 'reserved', 'idle', 'mixed', + 'down', 'unknown' + )), + health_state VARCHAR(255) NOT NULL + CHECK (job_state 
IN ( + 'full', 'partial', 'failed' + )), + meta_data TEXT, -- JSON + UNIQUE (hostname, cluster) +); diff --git a/pkg/schema/node.go b/pkg/schema/node.go new file mode 100644 index 0000000..3e2bbfb --- /dev/null +++ b/pkg/schema/node.go @@ -0,0 +1,35 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package schema + +type NodeState string + +const ( + NodeStateAllocated NodeState = "allocated" + NodeStateReserved NodeState = "reserved" + NodeStateIdle NodeState = "idle" + NodeStateMixed NodeState = "mixed" + NodeStateDown NodeState = "down" + NodeStateUnknown NodeState = "unknown" +) + +type MonitoringState string + +const ( + MonitoringStateFull MonitoringState = "full" + MonitoringStatePartial MonitoringState = "partial" + MonitoringStateFailed MonitoringState = "failed" +) + +type Node struct { + ID int64 `json:"id" db:"id"` + Hostname string `json:"hostname" db:"hostname" example:"fritz"` + Cluster string `json:"cluster" db:"cluster" example:"fritz"` + SubCluster string `json:"subCluster" db:"subcluster" example:"main"` + NodeState NodeState `json:"nodeState" db:"node_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` + HealthState MonitoringState `json:"healthState" db:"health_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` + RawMetaData []byte `json:"-" db:"meta_data"` + MetaData map[string]string `json:"metaData"` +} From 2c102cd1ff987ac9d7d811897e7c990b429756e7 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 3 Jun 2025 06:55:49 +0200 Subject: [PATCH 37/45] Fix error in node table migration --- internal/repository/migrations/sqlite3/10_node-table.up.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql index e4cb6c0..a11f20d 100644 --- a/internal/repository/migrations/sqlite3/10_node-table.up.sql +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -4,12 +4,12 @@ CREATE TABLE "node" ( cluster VARCHAR(255) NOT NULL, subcluster VARCHAR(255) NOT NULL, node_state VARCHAR(255) NOT NULL - CHECK (job_state IN ( + CHECK (node_state IN ( 'allocated', 'reserved', 'idle', 'mixed', 'down', 'unknown' )), health_state VARCHAR(255) NOT NULL - CHECK (job_state IN ( + CHECK (health_state IN ( 'full', 'partial', 'failed' )), meta_data TEXT, -- JSON From 8b1b99ba357590e8d414905572650f7b27e7d4fa Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 3 Jun 2025 07:16:19 +0200 Subject: [PATCH 38/45] feat: Add requested memory to job meta data Fixes #110 --- pkg/schema/job.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/schema/job.go b/pkg/schema/job.go index ef1ecde..00051f4 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -36,6 +36,7 @@ type Job struct { Energy float64 `json:"energy" db:"energy"` ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` + RequestedMemory int64 `json:"requestedMemory,omitempty" db:"requested_memory" example:"128000" minimum:"1"` // in MB JobID int64 `json:"jobId" db:"job_id" example:"123000"` Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` From 
6f9737c2c2bdf3bd7e62244b421466ed152acdf5 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 4 Jun 2025 13:44:37 +0200 Subject: [PATCH 39/45] Add node repository, extend GraphQL API Sync commit. --- api/schema.graphqls | 427 +++++++----- gqlgen.yml | 9 + internal/graph/generated/generated.go | 957 +++++++++++++++++++++----- internal/graph/schema.resolvers.go | 19 + internal/repository/node.go | 217 ++++++ 5 files changed, 1315 insertions(+), 314 deletions(-) create mode 100644 internal/repository/node.go diff --git a/api/schema.graphqls b/api/schema.graphqls index 268a579..6542464 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -4,61 +4,78 @@ scalar Any scalar NullableFloat scalar MetricScope scalar JobState +scalar NodeState +scalar MonitoringState + +type Node { + id: ID! + hostname: String! + cluster: String! + subCluster: String! + nodeState: NodeState! + HealthState: MonitoringState! + metaData: Any +} + +type NodeStats { + state: String! + count: Int! +} type Job { - id: ID! - jobId: Int! - user: String! - project: String! - cluster: String! - subCluster: String! - startTime: Time! - duration: Int! - walltime: Int! - numNodes: Int! - numHWThreads: Int! - numAcc: Int! - energy: Float! - SMT: Int! - exclusive: Int! - partition: String! - arrayJobId: Int! + id: ID! + jobId: Int! + user: String! + project: String! + cluster: String! + subCluster: String! + startTime: Time! + duration: Int! + walltime: Int! + numNodes: Int! + numHWThreads: Int! + numAcc: Int! + energy: Float! + SMT: Int! + exclusive: Int! + partition: String! + arrayJobId: Int! monitoringStatus: Int! - state: JobState! - tags: [Tag!]! - resources: [Resource!]! - concurrentJobs: JobLinkResultList - footprint: [FootprintValue] - energyFootprint: [EnergyFootprintValue] - metaData: Any - userData: User + state: JobState! + tags: [Tag!]! + resources: [Resource!]! + concurrentJobs: JobLinkResultList + footprint: [FootprintValue] + energyFootprint: [EnergyFootprintValue] + metaData: Any + userData: User } type JobLink { - id: ID! - jobId: Int! + id: ID! + jobId: Int! } type Cluster { - name: String! - partitions: [String!]! # Slurm partitions - subClusters: [SubCluster!]! # Hardware partitions/subclusters + name: String! + partitions: [String!]! # Slurm partitions + subClusters: [SubCluster!]! # Hardware partitions/subclusters } type SubCluster { - name: String! - nodes: String! - numberOfNodes: Int! - processorType: String! - socketsPerNode: Int! - coresPerSocket: Int! - threadsPerCore: Int! - flopRateScalar: MetricValue! - flopRateSimd: MetricValue! + name: String! + nodes: String! + numberOfNodes: Int! + processorType: String! + socketsPerNode: Int! + coresPerSocket: Int! + threadsPerCore: Int! + flopRateScalar: MetricValue! + flopRateSimd: MetricValue! memoryBandwidth: MetricValue! - topology: Topology! - metricConfig: [MetricConfig!]! - footprint: [String!]! + topology: Topology! + metricConfig: [MetricConfig!]! + footprint: [String!]! } type FootprintValue { @@ -80,94 +97,94 @@ type MetricValue { } type Topology { - node: [Int!] - socket: [[Int!]!] + node: [Int!] + socket: [[Int!]!] memoryDomain: [[Int!]!] - die: [[Int!]!] - core: [[Int!]!] + die: [[Int!]!] + core: [[Int!]!] accelerators: [Accelerator!] } type Accelerator { - id: String! - type: String! + id: String! + type: String! model: String! } type SubClusterConfig { - name: String! - peak: Float - normal: Float + name: String! 
+ peak: Float + normal: Float caution: Float - alert: Float - remove: Boolean + alert: Float + remove: Boolean } type MetricConfig { - name: String! - unit: Unit! - scope: MetricScope! + name: String! + unit: Unit! + scope: MetricScope! aggregation: String! - timestep: Int! - peak: Float! - normal: Float + timestep: Int! + peak: Float! + normal: Float caution: Float! - alert: Float! + alert: Float! lowerIsBetter: Boolean subClusters: [SubClusterConfig!]! } type Tag { - id: ID! + id: ID! type: String! name: String! scope: String! } type Resource { - hostname: String! - hwthreads: [Int!] - accelerators: [String!] + hostname: String! + hwthreads: [Int!] + accelerators: [String!] configuration: String } type JobMetricWithName { - name: String! - scope: MetricScope! + name: String! + scope: MetricScope! metric: JobMetric! } type JobMetric { - unit: Unit - timestep: Int! - series: [Series!] + unit: Unit + timestep: Int! + series: [Series!] statisticsSeries: StatsSeries } type Series { - hostname: String! - id: String + hostname: String! + id: String statistics: MetricStatistics - data: [NullableFloat!]! + data: [NullableFloat!]! } type StatsSeries { - mean: [NullableFloat!]! + mean: [NullableFloat!]! median: [NullableFloat!]! - min: [NullableFloat!]! - max: [NullableFloat!]! + min: [NullableFloat!]! + max: [NullableFloat!]! } type NamedStatsWithScope { - name: String! - scope: MetricScope! - stats: [ScopedStats!]! + name: String! + scope: MetricScope! + stats: [ScopedStats!]! } type ScopedStats { - hostname: String! - id: String - data: MetricStatistics! + hostname: String! + id: String + data: MetricStatistics! } type JobStats { @@ -184,8 +201,8 @@ type JobStats { } type NamedStats { - name: String! - data: MetricStatistics! + name: String! + data: MetricStatistics! } type Unit { @@ -201,12 +218,12 @@ type MetricStatistics { type MetricFootprints { metric: String! - data: [NullableFloat!]! + data: [NullableFloat!]! } type Footprints { timeWeights: TimeWeights! - metrics: [MetricFootprints!]! + metrics: [MetricFootprints!]! } type TimeWeights { @@ -215,20 +232,33 @@ type TimeWeights { coreHours: [NullableFloat!]! } -enum Aggregate { USER, PROJECT, CLUSTER } -enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS } +enum Aggregate { + USER + PROJECT + CLUSTER +} +enum SortByAggregate { + TOTALWALLTIME + TOTALJOBS + TOTALNODES + TOTALNODEHOURS + TOTALCORES + TOTALCOREHOURS + TOTALACCS + TOTALACCHOURS +} type NodeMetrics { - host: String! + host: String! subCluster: String! - metrics: [JobMetricWithName!]! + metrics: [JobMetricWithName!]! } type NodesResultList { - items: [NodeMetrics!]! + items: [NodeMetrics!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int totalNodes: Int hasNextPage: Boolean } @@ -247,14 +277,14 @@ type GlobalMetricListItem { } type Count { - name: String! + name: String! count: Int! } type User { username: String! - name: String! - email: String! + name: String! + email: String! } input MetricStatItem { @@ -263,27 +293,81 @@ input MetricStatItem { } type Query { - clusters: [Cluster!]! # List of all clusters - tags: [Tag!]! # List of all tags - globalMetrics: [GlobalMetricListItem!]! + clusters: [Cluster!]! # List of all clusters + tags: [Tag!]! # List of all tags + globalMetrics: [GlobalMetricListItem!]! user(username: String!): User allocatedNodes(cluster: String!): [Count!]! 
- job(id: ID!): Job - jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]! - jobStats(id: ID!, metrics: [String!]): [NamedStats!]! - scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]! + node(id: ID!): Node + nodes(filter: [NodeFilter!], order: OrderByInput): NodesResultList! + nodeStats(filter: [NodeFilter!]): [NodeStats!]! + + job(id: ID!): Job + jobMetrics( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + resolution: Int + ): [JobMetricWithName!]! + + jobStats(id: ID!, metrics: [String!]): [NamedStats!]! + + scopedJobStats( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + ): [NamedStatsWithScope!]! + + jobs( + filter: [JobFilter!] + page: PageRequest + order: OrderByInput + ): JobResultList! + + jobsStatistics( + filter: [JobFilter!] + metrics: [String!] + page: PageRequest + sortBy: SortByAggregate + groupBy: Aggregate + numDurationBins: String + numMetricBins: Int + ): [JobsStatistics!]! - jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList! - jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]! jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints - rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! + rooflineHeatmap( + filter: [JobFilter!]! + rows: Int! + cols: Int! + minX: Float! + minY: Float! + maxX: Float! + maxY: Float! + ): [[Float!]!]! - nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! - nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList! + nodeMetrics( + cluster: String! + nodes: [String!] + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + ): [NodeMetrics!]! + nodeMetricsList( + cluster: String! + subCluster: String! + nodeFilter: String! + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + page: PageRequest + resolution: Int + ): NodesResultList! } type Mutation { @@ -296,38 +380,53 @@ type Mutation { updateConfiguration(name: String!, value: String!): String } -type IntRangeOutput { from: Int!, to: Int! } -type TimeRangeOutput { range: String, from: Time!, to: Time! } +type IntRangeOutput { + from: Int! + to: Int! +} +type TimeRangeOutput { + range: String + from: Time! + to: Time! +} + +input NodeFilter { + hostname: StringInput + cluster: StringInput + subCluster: StringInput + nodeState: NodeState + healthState: MonitoringState +} input JobFilter { - tags: [ID!] - dbId: [ID!] - jobId: StringInput - arrayJobId: Int - user: StringInput - project: StringInput - jobName: StringInput - cluster: StringInput - partition: StringInput - duration: IntRange - energy: FloatRange + tags: [ID!] + dbId: [ID!] + jobId: StringInput + arrayJobId: Int + user: StringInput + project: StringInput + jobName: StringInput + cluster: StringInput + partition: StringInput + duration: IntRange + energy: FloatRange minRunningFor: Int - numNodes: IntRange + numNodes: IntRange numAccelerators: IntRange - numHWThreads: IntRange + numHWThreads: IntRange - startTime: TimeRange - state: [JobState!] 
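
The Query additions above introduce node, nodes, and nodeStats, filtered through the new NodeFilter input. For orientation, a client could exercise nodeStats roughly as in the following sketch. This is not part of the patch: the endpoint path /query, the port, and the bearer token are assumptions of this example, and "fritz" is just a sample cluster name.

// Minimal client-side sketch of the new nodeStats query (illustrative only).
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Count nodes per state on one cluster, using the NodeFilter input
	// introduced by this patch.
	query := `{ nodeStats(filter: [{ cluster: { eq: "fritz" } }]) { state count } }`
	payload, err := json.Marshal(map[string]string{"query": query})
	if err != nil {
		panic(err)
	}

	// Assumed endpoint and authentication; adjust to the actual deployment.
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/query", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer <JWT>") // hypothetical token

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Print the raw GraphQL response, e.g. {"data":{"nodeStats":[...]}}.
	var result map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	fmt.Println(result)
}
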
+ startTime: TimeRange + state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int - node: StringInput + exclusive: Int + node: StringInput } input OrderByInput { field: String! - type: String!, + type: String! order: SortDirectionEnum! = ASC } @@ -337,34 +436,46 @@ enum SortDirectionEnum { } input StringInput { - eq: String - neq: String - contains: String + eq: String + neq: String + contains: String startsWith: String - endsWith: String - in: [String!] + endsWith: String + in: [String!] } -input IntRange { from: Int!, to: Int! } -input TimeRange { range: String, from: Time, to: Time } +input IntRange { + from: Int! + to: Int! +} +input TimeRange { + range: String + from: Time + to: Time +} input FloatRange { from: Float! to: Float! } +type NodesResultList { + items: [Node!]! + count: Int +} + type JobResultList { - items: [Job!]! + items: [Job!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int hasNextPage: Boolean } type JobLinkResultList { listQuery: String - items: [JobLink!]! - count: Int + items: [JobLink!]! + count: Int } type HistoPoint { @@ -386,27 +497,27 @@ type MetricHistoPoint { max: Int } -type JobsStatistics { - id: ID! # If `groupBy` was used, ID of the user/project/cluster - name: String! # if User-Statistics: Given Name of Account (ID) Owner - totalJobs: Int! # Number of jobs - runningJobs: Int! # Number of running jobs - shortJobs: Int! # Number of jobs with a duration of less than duration - totalWalltime: Int! # Sum of the duration of all matched jobs in hours - totalNodes: Int! # Sum of the nodes of all matched jobs - totalNodeHours: Int! # Sum of the node hours of all matched jobs - totalCores: Int! # Sum of the cores of all matched jobs - totalCoreHours: Int! # Sum of the core hours of all matched jobs - totalAccs: Int! # Sum of the accs of all matched jobs - totalAccHours: Int! # Sum of the gpu hours of all matched jobs - histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value - histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes - histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores - histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs - histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average +type JobsStatistics { + id: ID! # If `groupBy` was used, ID of the user/project/cluster + name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalJobs: Int! # Number of jobs + runningJobs: Int! # Number of running jobs + shortJobs: Int! # Number of jobs with a duration of less than duration + totalWalltime: Int! # Sum of the duration of all matched jobs in hours + totalNodes: Int! # Sum of the nodes of all matched jobs + totalNodeHours: Int! # Sum of the node hours of all matched jobs + totalCores: Int! # Sum of the cores of all matched jobs + totalCoreHours: Int! # Sum of the core hours of all matched jobs + totalAccs: Int! # Sum of the accs of all matched jobs + totalAccHours: Int! # Sum of the gpu hours of all matched jobs + histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value + histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes + histNumCores: [HistoPoint!]! 
# value: number of cores, count: number of jobs with that number of cores + histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs + histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average } input PageRequest { itemsPerPage: Int! - page: Int! + page: Int! } diff --git a/gqlgen.yml b/gqlgen.yml index ccd95ff..307a074 100644 --- a/gqlgen.yml +++ b/gqlgen.yml @@ -62,6 +62,11 @@ models: fields: partitions: resolver: true + Node: + model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Node" + fields: + metaData: + resolver: true NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" } MetricScope: @@ -81,6 +86,10 @@ models: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" } JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" } + MonitoringState: + { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.NodeState" } + HealthState: + { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MonitoringState" } TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" } IntRange: diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 60e3ca0..a1e9f92 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -44,6 +44,7 @@ type ResolverRoot interface { Job() JobResolver MetricValue() MetricValueResolver Mutation() MutationResolver + Node() NodeResolver Query() QueryResolver SubCluster() SubClusterResolver } @@ -268,6 +269,16 @@ type ComplexityRoot struct { Stats func(childComplexity int) int } + Node struct { + Cluster func(childComplexity int) int + HealthState func(childComplexity int) int + Hostname func(childComplexity int) int + ID func(childComplexity int) int + MetaData func(childComplexity int) int + NodeState func(childComplexity int) int + SubCluster func(childComplexity int) int + } + NodeMetrics struct { Host func(childComplexity int) int Metrics func(childComplexity int) int @@ -419,6 +430,11 @@ type MutationResolver interface { RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) } +type NodeResolver interface { + NodeState(ctx context.Context, obj *schema.Node) (string, error) + HealthState(ctx context.Context, obj *schema.Node) (schema.NodeState, error) + MetaData(ctx context.Context, obj *schema.Node) (any, error) +} type QueryResolver interface { Clusters(ctx context.Context) ([]*schema.Cluster, error) Tags(ctx context.Context) ([]*schema.Tag, error) @@ -1435,6 +1451,55 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.NamedStatsWithScope.Stats(childComplexity), true + case "Node.cluster": + if e.complexity.Node.Cluster == nil { + break + } + + return e.complexity.Node.Cluster(childComplexity), true + + case "Node.HealthState": + if e.complexity.Node.HealthState == nil { + break + } + + return e.complexity.Node.HealthState(childComplexity), true + + case "Node.hostname": + if e.complexity.Node.Hostname == nil { + break + } + + return e.complexity.Node.Hostname(childComplexity), true + + case "Node.id": + if e.complexity.Node.ID == nil { + break + } + + return e.complexity.Node.ID(childComplexity), true + + case "Node.metaData": + if e.complexity.Node.MetaData == nil { + break + } + + return 
e.complexity.Node.MetaData(childComplexity), true + + case "Node.nodeState": + if e.complexity.Node.NodeState == nil { + break + } + + return e.complexity.Node.NodeState(childComplexity), true + + case "Node.subCluster": + if e.complexity.Node.SubCluster == nil { + break + } + + return e.complexity.Node.SubCluster(childComplexity), true + case "NodeMetrics.host": if e.complexity.NodeMetrics.Host == nil { break @@ -2179,61 +2244,73 @@ scalar Any scalar NullableFloat scalar MetricScope scalar JobState +scalar NodeState +scalar MonitoringState + +type Node { + id: ID! + hostname: String! + cluster: String! + subCluster: String! + nodeState: NodeState! + HealthState: MonitoringState! + metaData: Any +} type Job { - id: ID! - jobId: Int! - user: String! - project: String! - cluster: String! - subCluster: String! - startTime: Time! - duration: Int! - walltime: Int! - numNodes: Int! - numHWThreads: Int! - numAcc: Int! - energy: Float! - SMT: Int! - exclusive: Int! - partition: String! - arrayJobId: Int! + id: ID! + jobId: Int! + user: String! + project: String! + cluster: String! + subCluster: String! + startTime: Time! + duration: Int! + walltime: Int! + numNodes: Int! + numHWThreads: Int! + numAcc: Int! + energy: Float! + SMT: Int! + exclusive: Int! + partition: String! + arrayJobId: Int! monitoringStatus: Int! - state: JobState! - tags: [Tag!]! - resources: [Resource!]! - concurrentJobs: JobLinkResultList - footprint: [FootprintValue] - energyFootprint: [EnergyFootprintValue] - metaData: Any - userData: User + state: JobState! + tags: [Tag!]! + resources: [Resource!]! + concurrentJobs: JobLinkResultList + footprint: [FootprintValue] + energyFootprint: [EnergyFootprintValue] + metaData: Any + userData: User } type JobLink { - id: ID! - jobId: Int! + id: ID! + jobId: Int! } type Cluster { - name: String! - partitions: [String!]! # Slurm partitions - subClusters: [SubCluster!]! # Hardware partitions/subclusters + name: String! + partitions: [String!]! # Slurm partitions + subClusters: [SubCluster!]! # Hardware partitions/subclusters } type SubCluster { - name: String! - nodes: String! - numberOfNodes: Int! - processorType: String! - socketsPerNode: Int! - coresPerSocket: Int! - threadsPerCore: Int! - flopRateScalar: MetricValue! - flopRateSimd: MetricValue! + name: String! + nodes: String! + numberOfNodes: Int! + processorType: String! + socketsPerNode: Int! + coresPerSocket: Int! + threadsPerCore: Int! + flopRateScalar: MetricValue! + flopRateSimd: MetricValue! memoryBandwidth: MetricValue! - topology: Topology! - metricConfig: [MetricConfig!]! - footprint: [String!]! + topology: Topology! + metricConfig: [MetricConfig!]! + footprint: [String!]! } type FootprintValue { @@ -2255,94 +2332,94 @@ type MetricValue { } type Topology { - node: [Int!] - socket: [[Int!]!] + node: [Int!] + socket: [[Int!]!] memoryDomain: [[Int!]!] - die: [[Int!]!] - core: [[Int!]!] + die: [[Int!]!] + core: [[Int!]!] accelerators: [Accelerator!] } type Accelerator { - id: String! - type: String! + id: String! + type: String! model: String! } type SubClusterConfig { - name: String! - peak: Float - normal: Float + name: String! + peak: Float + normal: Float caution: Float - alert: Float - remove: Boolean + alert: Float + remove: Boolean } type MetricConfig { - name: String! - unit: Unit! - scope: MetricScope! + name: String! + unit: Unit! + scope: MetricScope! aggregation: String! - timestep: Int! - peak: Float! - normal: Float + timestep: Int! + peak: Float! + normal: Float caution: Float! - alert: Float! + alert: Float! 
lowerIsBetter: Boolean subClusters: [SubClusterConfig!]! } type Tag { - id: ID! + id: ID! type: String! name: String! scope: String! } type Resource { - hostname: String! - hwthreads: [Int!] - accelerators: [String!] + hostname: String! + hwthreads: [Int!] + accelerators: [String!] configuration: String } type JobMetricWithName { - name: String! - scope: MetricScope! + name: String! + scope: MetricScope! metric: JobMetric! } type JobMetric { - unit: Unit - timestep: Int! - series: [Series!] + unit: Unit + timestep: Int! + series: [Series!] statisticsSeries: StatsSeries } type Series { - hostname: String! - id: String + hostname: String! + id: String statistics: MetricStatistics - data: [NullableFloat!]! + data: [NullableFloat!]! } type StatsSeries { - mean: [NullableFloat!]! + mean: [NullableFloat!]! median: [NullableFloat!]! - min: [NullableFloat!]! - max: [NullableFloat!]! + min: [NullableFloat!]! + max: [NullableFloat!]! } type NamedStatsWithScope { - name: String! - scope: MetricScope! - stats: [ScopedStats!]! + name: String! + scope: MetricScope! + stats: [ScopedStats!]! } type ScopedStats { - hostname: String! - id: String - data: MetricStatistics! + hostname: String! + id: String + data: MetricStatistics! } type JobStats { @@ -2359,8 +2436,8 @@ type JobStats { } type NamedStats { - name: String! - data: MetricStatistics! + name: String! + data: MetricStatistics! } type Unit { @@ -2376,12 +2453,12 @@ type MetricStatistics { type MetricFootprints { metric: String! - data: [NullableFloat!]! + data: [NullableFloat!]! } type Footprints { timeWeights: TimeWeights! - metrics: [MetricFootprints!]! + metrics: [MetricFootprints!]! } type TimeWeights { @@ -2390,20 +2467,33 @@ type TimeWeights { coreHours: [NullableFloat!]! } -enum Aggregate { USER, PROJECT, CLUSTER } -enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS } +enum Aggregate { + USER + PROJECT + CLUSTER +} +enum SortByAggregate { + TOTALWALLTIME + TOTALJOBS + TOTALNODES + TOTALNODEHOURS + TOTALCORES + TOTALCOREHOURS + TOTALACCS + TOTALACCHOURS +} type NodeMetrics { - host: String! + host: String! subCluster: String! - metrics: [JobMetricWithName!]! + metrics: [JobMetricWithName!]! } type NodesResultList { - items: [NodeMetrics!]! + items: [NodeMetrics!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int totalNodes: Int hasNextPage: Boolean } @@ -2422,14 +2512,14 @@ type GlobalMetricListItem { } type Count { - name: String! + name: String! count: Int! } type User { username: String! - name: String! - email: String! + name: String! + email: String! } input MetricStatItem { @@ -2438,27 +2528,73 @@ input MetricStatItem { } type Query { - clusters: [Cluster!]! # List of all clusters - tags: [Tag!]! # List of all tags - globalMetrics: [GlobalMetricListItem!]! + clusters: [Cluster!]! # List of all clusters + tags: [Tag!]! # List of all tags + globalMetrics: [GlobalMetricListItem!]! user(username: String!): User allocatedNodes(cluster: String!): [Count!]! job(id: ID!): Job - jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]! + jobMetrics( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + resolution: Int + ): [JobMetricWithName!]! jobStats(id: ID!, metrics: [String!]): [NamedStats!]! - scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]! + scopedJobStats( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + ): [NamedStatsWithScope!]! 
- jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList! - jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]! + jobs( + filter: [JobFilter!] + page: PageRequest + order: OrderByInput + ): JobResultList! + jobsStatistics( + filter: [JobFilter!] + metrics: [String!] + page: PageRequest + sortBy: SortByAggregate + groupBy: Aggregate + numDurationBins: String + numMetricBins: Int + ): [JobsStatistics!]! jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints - rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! + rooflineHeatmap( + filter: [JobFilter!]! + rows: Int! + cols: Int! + minX: Float! + minY: Float! + maxX: Float! + maxY: Float! + ): [[Float!]!]! - nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! - nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList! + nodeMetrics( + cluster: String! + nodes: [String!] + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + ): [NodeMetrics!]! + nodeMetricsList( + cluster: String! + subCluster: String! + nodeFilter: String! + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + page: PageRequest + resolution: Int + ): NodesResultList! } type Mutation { @@ -2471,38 +2607,45 @@ type Mutation { updateConfiguration(name: String!, value: String!): String } -type IntRangeOutput { from: Int!, to: Int! } -type TimeRangeOutput { range: String, from: Time!, to: Time! } +type IntRangeOutput { + from: Int! + to: Int! +} +type TimeRangeOutput { + range: String + from: Time! + to: Time! +} input JobFilter { - tags: [ID!] - dbId: [ID!] - jobId: StringInput - arrayJobId: Int - user: StringInput - project: StringInput - jobName: StringInput - cluster: StringInput - partition: StringInput - duration: IntRange - energy: FloatRange + tags: [ID!] + dbId: [ID!] + jobId: StringInput + arrayJobId: Int + user: StringInput + project: StringInput + jobName: StringInput + cluster: StringInput + partition: StringInput + duration: IntRange + energy: FloatRange minRunningFor: Int - numNodes: IntRange + numNodes: IntRange numAccelerators: IntRange - numHWThreads: IntRange + numHWThreads: IntRange - startTime: TimeRange - state: [JobState!] + startTime: TimeRange + state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int - node: StringInput + exclusive: Int + node: StringInput } input OrderByInput { field: String! - type: String!, + type: String! order: SortDirectionEnum! = ASC } @@ -2512,16 +2655,23 @@ enum SortDirectionEnum { } input StringInput { - eq: String - neq: String - contains: String + eq: String + neq: String + contains: String startsWith: String - endsWith: String - in: [String!] + endsWith: String + in: [String!] } -input IntRange { from: Int!, to: Int! } -input TimeRange { range: String, from: Time, to: Time } +input IntRange { + from: Int! + to: Int! +} +input TimeRange { + range: String + from: Time + to: Time +} input FloatRange { from: Float! @@ -2529,17 +2679,17 @@ input FloatRange { } type JobResultList { - items: [Job!]! + items: [Job!]! 
offset: Int - limit: Int - count: Int + limit: Int + count: Int hasNextPage: Boolean } type JobLinkResultList { listQuery: String - items: [JobLink!]! - count: Int + items: [JobLink!]! + count: Int } type HistoPoint { @@ -2561,29 +2711,29 @@ type MetricHistoPoint { max: Int } -type JobsStatistics { - id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster - name: String! # if User-Statistics: Given Name of Account (ID) Owner - totalJobs: Int! # Number of jobs - runningJobs: Int! # Number of running jobs - shortJobs: Int! # Number of jobs with a duration of less than duration - totalWalltime: Int! # Sum of the duration of all matched jobs in hours - totalNodes: Int! # Sum of the nodes of all matched jobs - totalNodeHours: Int! # Sum of the node hours of all matched jobs - totalCores: Int! # Sum of the cores of all matched jobs - totalCoreHours: Int! # Sum of the core hours of all matched jobs - totalAccs: Int! # Sum of the accs of all matched jobs - totalAccHours: Int! # Sum of the gpu hours of all matched jobs - histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value - histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes - histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores - histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs - histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average +type JobsStatistics { + id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster + name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalJobs: Int! # Number of jobs + runningJobs: Int! # Number of running jobs + shortJobs: Int! # Number of jobs with a duration of less than duration + totalWalltime: Int! # Sum of the duration of all matched jobs in hours + totalNodes: Int! # Sum of the nodes of all matched jobs + totalNodeHours: Int! # Sum of the node hours of all matched jobs + totalCores: Int! # Sum of the cores of all matched jobs + totalCoreHours: Int! # Sum of the core hours of all matched jobs + totalAccs: Int! # Sum of the accs of all matched jobs + totalAccHours: Int! # Sum of the gpu hours of all matched jobs + histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value + histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes + histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores + histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs + histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average } input PageRequest { itemsPerPage: Int! - page: Int! + page: Int! 
} `, BuiltIn: false}, } @@ -10445,6 +10595,311 @@ func (ec *executionContext) fieldContext_NamedStatsWithScope_stats(_ context.Con return fc, nil } +func (ec *executionContext) _Node_id(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_id(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.ID, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(int64) + fc.Result = res + return ec.marshalNID2int64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_id(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type ID does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_hostname(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_hostname(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Hostname, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_hostname(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_cluster(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_cluster(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Cluster, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := 
resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_cluster(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_subCluster(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_subCluster(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.SubCluster, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_subCluster(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_nodeState(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_nodeState(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().NodeState(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNNodeState2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_nodeState(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type NodeState does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_HealthState(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_HealthState(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer 
func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().HealthState(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(schema.NodeState) + fc.Result = res + return ec.marshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_HealthState(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type MonitoringState does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_metaData(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_metaData(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().MetaData(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(any) + fc.Result = res + return ec.marshalOAny2interface(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_metaData(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Any does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _NodeMetrics_host(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { fc, err := ec.fieldContext_NodeMetrics_host(ctx, field) if err != nil { @@ -18695,6 +19150,165 @@ func (ec *executionContext) _NamedStatsWithScope(ctx context.Context, sel ast.Se return out } +var nodeImplementors = []string{"Node"} + +func (ec *executionContext) _Node(ctx context.Context, sel ast.SelectionSet, obj *schema.Node) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, nodeImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("Node") + case "id": + out.Values[i] = ec._Node_id(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "hostname": + out.Values[i] = ec._Node_hostname(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "cluster": + out.Values[i] = ec._Node_cluster(ctx, 
field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "subCluster": + out.Values[i] = ec._Node_subCluster(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "nodeState": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_nodeState(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "HealthState": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_HealthState(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "metaData": + field := field + + innerFunc := func(ctx context.Context, _ *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_metaData(ctx, field, obj) + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + var nodeMetricsImplementors = []string{"NodeMetrics"} func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionSet, obj *model.NodeMetrics) graphql.Marshaler { @@ -21285,6 +21899,22 @@ func (ec 
*executionContext) marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋ return ec._MetricValue(ctx, sel, &v) } +func (ec *executionContext) unmarshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, v any) (schema.NodeState, error) { + tmp, err := graphql.UnmarshalString(v) + res := schema.NodeState(tmp) + return res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, sel ast.SelectionSet, v schema.NodeState) graphql.Marshaler { + res := graphql.MarshalString(string(v)) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) marshalNNamedStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NamedStats) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup @@ -21447,6 +22077,21 @@ func (ec *executionContext) marshalNNodeMetrics2ᚖgithubᚗcomᚋClusterCockpit return ec._NodeMetrics(ctx, sel, v) } +func (ec *executionContext) unmarshalNNodeState2string(ctx context.Context, v any) (string, error) { + res, err := graphql.UnmarshalString(v) + return res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNNodeState2string(ctx context.Context, sel ast.SelectionSet, v string) graphql.Marshaler { + res := graphql.MarshalString(v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) marshalNNodesResultList2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList(ctx context.Context, sel ast.SelectionSet, v model.NodesResultList) graphql.Marshaler { return ec._NodesResultList(ctx, sel, &v) } diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 6b790a5..7d2331f 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -304,6 +304,21 @@ func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, return nil, nil } +// NodeState is the resolver for the nodeState field. +func (r *nodeResolver) NodeState(ctx context.Context, obj *schema.Node) (string, error) { + panic(fmt.Errorf("not implemented: NodeState - nodeState")) +} + +// HealthState is the resolver for the HealthState field. +func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (schema.NodeState, error) { + panic(fmt.Errorf("not implemented: HealthState - HealthState")) +} + +// MetaData is the resolver for the metaData field. +func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) { + panic(fmt.Errorf("not implemented: MetaData - metaData")) +} + // Clusters is the resolver for the clusters field. func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) { return archive.Clusters, nil @@ -775,6 +790,9 @@ func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricV // Mutation returns generated.MutationResolver implementation. func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} } +// Node returns generated.NodeResolver implementation. 
+func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} } + // Query returns generated.QueryResolver implementation. func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } @@ -785,5 +803,6 @@ type clusterResolver struct{ *Resolver } type jobResolver struct{ *Resolver } type metricValueResolver struct{ *Resolver } type mutationResolver struct{ *Resolver } +type nodeResolver struct{ *Resolver } type queryResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver } diff --git a/internal/repository/node.go b/internal/repository/node.go new file mode 100644 index 0000000..3713bbd --- /dev/null +++ b/internal/repository/node.go @@ -0,0 +1,217 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "encoding/json" + "fmt" + "maps" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/lrucache" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" + "github.com/jmoiron/sqlx" +) + +var ( + nodeRepoOnce sync.Once + nodeRepoInstance *NodeRepository +) + +type NodeRepository struct { + DB *sqlx.DB + stmtCache *sq.StmtCache + cache *lrucache.Cache + driver string +} + +func GetNodeRepository() *NodeRepository { + nodeRepoOnce.Do(func() { + db := GetConnection() + + nodeRepoInstance = &NodeRepository{ + DB: db.DB, + driver: db.Driver, + + stmtCache: sq.NewStmtCache(db.DB), + cache: lrucache.New(1024 * 1024), + } + }) + return nodeRepoInstance +} + +func (r *NodeRepository) FetchMetadata(node *schema.Node) (map[string]string, error) { + start := time.Now() + cachekey := fmt.Sprintf("metadata:%d", node.ID) + if cached := r.cache.Get(cachekey, nil); cached != nil { + node.MetaData = cached.(map[string]string) + return node.MetaData, nil + } + + if err := sq.Select("node.meta_data").From("node").Where("node.id = ?", node.ID). + RunWith(r.stmtCache).QueryRow().Scan(&node.RawMetaData); err != nil { + log.Warn("Error while scanning for node metadata") + return nil, err + } + + if len(node.RawMetaData) == 0 { + return nil, nil + } + + if err := json.Unmarshal(node.RawMetaData, &node.MetaData); err != nil { + log.Warn("Error while unmarshaling raw metadata json") + return nil, err + } + + r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour) + log.Debugf("Timer FetchMetadata %s", time.Since(start)) + return node.MetaData, nil +} + +func (r *NodeRepository) UpdateMetadata(node *schema.Node, key, val string) (err error) { + cachekey := fmt.Sprintf("metadata:%d", node.ID) + r.cache.Del(cachekey) + if node.MetaData == nil { + if _, err = r.FetchMetadata(node); err != nil { + log.Warnf("Error while fetching metadata for node, DB ID '%v'", node.ID) + return err + } + } + + if node.MetaData != nil { + cpy := make(map[string]string, len(node.MetaData)+1) + maps.Copy(cpy, node.MetaData) + cpy[key] = val + node.MetaData = cpy + } else { + node.MetaData = map[string]string{key: val} + } + + if node.RawMetaData, err = json.Marshal(node.MetaData); err != nil { + log.Warnf("Error while marshaling metadata for node, DB ID '%v'", node.ID) + return err + } + + if _, err = sq.Update("node"). + Set("meta_data", node.RawMetaData). + Where("node.id = ?", node.ID). 
+ RunWith(r.stmtCache).Exec(); err != nil { + log.Warnf("Error while updating metadata for node, DB ID '%v'", node.ID) + return err + } + + r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour) + return nil +} + +func (r *NodeRepository) GetNode(id int64, withMeta bool) (*schema.Node, error) { + node := &schema.Node{} + if err := sq.Select("id", "hostname", "cluster", "subcluster", "node_state", + "health_state").From("node"). + Where("node.id = ?", id).RunWith(r.DB). + QueryRow().Scan(&node.ID, &node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, + &node.HealthState); err != nil { + log.Warnf("Error while querying node '%v' from database", id) + return nil, err + } + + if withMeta { + var err error + var meta map[string]string + if meta, err = r.FetchMetadata(node); err != nil { + log.Warnf("Error while fetching metadata for node '%v'", id) + return nil, err + } + node.MetaData = meta + } + + return node, nil +} + +const NamedNodeInsert string = ` +INSERT INTO node (hostname, cluster, subcluster, node_state, health_state, raw_meta_data) + VALUES (:hostname, :cluster, :subcluster, :node_state, :health_state, :raw_meta_data);` + +func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) { + var err error + node.RawMetaData, err = json.Marshal(node.MetaData) + if err != nil { + log.Errorf("Error while marshaling metadata for node '%v'", node.Hostname) + return 0, err + } + + res, err := r.DB.NamedExec(NamedNodeInsert, node) + if err != nil { + log.Errorf("Error while adding node '%v' to database", node.Hostname) + return 0, err + } + node.ID, err = res.LastInsertId() + if err != nil { + log.Errorf("Error while getting last insert id for node '%v' from database", node.Hostname) + return 0, err + } + + return node.ID, nil +} + +func (r *NodeRepository) UpdateNodeState(id int64, nodeState *schema.NodeState) error { + if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { + log.Errorf("error while updating node '%d'", id) + return err + } + + return nil +} + +func (r *NodeRepository) UpdateHealthState(id int64, healthState *schema.MonitoringState) error { + if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { + log.Errorf("error while updating node '%d'", id) + return err + } + + return nil +} + +func (r *NodeRepository) DeleteNode(id int64) error { + _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id) + if err != nil { + log.Errorf("Error while deleting node '%d' from DB", id) + return err + } + log.Infof("deleted node '%d' from DB", id) + return nil +} + +func (r *NodeRepository) QueryNodes() ([]*schema.Node, error) { + return nil, nil +} + +func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { + q := sq.Select("hostname", "cluster", "subcluster", "node_state", + "health_state").From("node").Where("node.cluster = ?", cluster).OrderBy("node.hostname ASC") + + rows, err := q.RunWith(r.DB).Query() + if err != nil { + log.Warn("Error while querying user list") + return nil, err + } + nodeList := make([]*schema.Node, 0, 100) + defer rows.Close() + for rows.Next() { + node := &schema.Node{} + if err := rows.Scan(&node.Hostname, &node.Cluster, + &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + log.Warn("Error while scanning node list") + return nil, err + } + + nodeList = append(nodeList, node) + } + + return nodeList, nil +} From 7466fe7a347385dbb72b150513d6211941e47320 
Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 5 Jun 2025 13:17:24 +0200 Subject: [PATCH 40/45] Update GraphQL schema. Refactor node repository --- api/schema.graphqls | 4 +- internal/graph/generated/generated.go | 983 ++++++++++++++++++++++++++ internal/graph/model/models_gen.go | 18 + internal/graph/schema.resolvers.go | 15 + internal/repository/node.go | 8 +- 5 files changed, 1022 insertions(+), 6 deletions(-) diff --git a/api/schema.graphqls b/api/schema.graphqls index 6542464..a7bafde 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -301,7 +301,7 @@ type Query { allocatedNodes(cluster: String!): [Count!]! node(id: ID!): Node - nodes(filter: [NodeFilter!], order: OrderByInput): NodesResultList! + nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! nodeStats(filter: [NodeFilter!]): [NodeStats!]! job(id: ID!): Job @@ -459,7 +459,7 @@ input FloatRange { to: Float! } -type NodesResultList { +type NodeStateResultList { items: [Node!]! count: Int } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index a1e9f92..4f3b9fd 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -285,6 +285,16 @@ type ComplexityRoot struct { SubCluster func(childComplexity int) int } + NodeStateResultList struct { + Count func(childComplexity int) int + Items func(childComplexity int) int + } + + NodeStats struct { + Count func(childComplexity int) int + State func(childComplexity int) int + } + NodesResultList struct { Count func(childComplexity int) int HasNextPage func(childComplexity int) int @@ -305,8 +315,11 @@ type ComplexityRoot struct { JobsFootprints func(childComplexity int, filter []*model.JobFilter, metrics []string) int JobsMetricStats func(childComplexity int, filter []*model.JobFilter, metrics []string) int JobsStatistics func(childComplexity int, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) int + Node func(childComplexity int, id string) int NodeMetrics func(childComplexity int, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) int NodeMetricsList func(childComplexity int, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) int + NodeStats func(childComplexity int, filter []*model.NodeFilter) int + Nodes func(childComplexity int, filter []*model.NodeFilter, order *model.OrderByInput) int RooflineHeatmap func(childComplexity int, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) int ScopedJobStats func(childComplexity int, id string, metrics []string, scopes []schema.MetricScope) int Tags func(childComplexity int) int @@ -441,6 +454,9 @@ type QueryResolver interface { GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) User(ctx context.Context, username string) (*model.User, error) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) + Node(ctx context.Context, id string) (*schema.Node, error) + Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) + NodeStats(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStats, error) Job(ctx context.Context, id string) (*schema.Job, error) JobMetrics(ctx 
context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) @@ -1521,6 +1537,34 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.NodeMetrics.SubCluster(childComplexity), true + case "NodeStateResultList.count": + if e.complexity.NodeStateResultList.Count == nil { + break + } + + return e.complexity.NodeStateResultList.Count(childComplexity), true + + case "NodeStateResultList.items": + if e.complexity.NodeStateResultList.Items == nil { + break + } + + return e.complexity.NodeStateResultList.Items(childComplexity), true + + case "NodeStats.count": + if e.complexity.NodeStats.Count == nil { + break + } + + return e.complexity.NodeStats.Count(childComplexity), true + + case "NodeStats.state": + if e.complexity.NodeStats.State == nil { + break + } + + return e.complexity.NodeStats.State(childComplexity), true + case "NodesResultList.count": if e.complexity.NodesResultList.Count == nil { break @@ -1673,6 +1717,18 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Query.JobsStatistics(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string), args["page"].(*model.PageRequest), args["sortBy"].(*model.SortByAggregate), args["groupBy"].(*model.Aggregate), args["numDurationBins"].(*string), args["numMetricBins"].(*int)), true + case "Query.node": + if e.complexity.Query.Node == nil { + break + } + + args, err := ec.field_Query_node_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.Node(childComplexity, args["id"].(string)), true + case "Query.nodeMetrics": if e.complexity.Query.NodeMetrics == nil { break @@ -1697,6 +1753,30 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Query.NodeMetricsList(childComplexity, args["cluster"].(string), args["subCluster"].(string), args["nodeFilter"].(string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time), args["page"].(*model.PageRequest), args["resolution"].(*int)), true + case "Query.nodeStats": + if e.complexity.Query.NodeStats == nil { + break + } + + args, err := ec.field_Query_nodeStats_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.NodeStats(childComplexity, args["filter"].([]*model.NodeFilter)), true + + case "Query.nodes": + if e.complexity.Query.Nodes == nil { + break + } + + args, err := ec.field_Query_nodes_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.Nodes(childComplexity, args["filter"].([]*model.NodeFilter), args["order"].(*model.OrderByInput)), true + case "Query.rooflineHeatmap": if e.complexity.Query.RooflineHeatmap == nil { break @@ -2137,6 +2217,7 @@ func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { ec.unmarshalInputIntRange, ec.unmarshalInputJobFilter, ec.unmarshalInputMetricStatItem, + ec.unmarshalInputNodeFilter, ec.unmarshalInputOrderByInput, ec.unmarshalInputPageRequest, ec.unmarshalInputStringInput, @@ -2257,6 +2338,11 @@ type Node { metaData: Any } +type NodeStats { + state: String! + count: Int! +} + type Job { id: ID! jobId: Int! @@ -2535,6 +2621,10 @@ type Query { user(username: String!): User allocatedNodes(cluster: String!): [Count!]! 
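
The QueryResolver interface above gains Node, Nodes, and NodeStats. One way these could be backed by the node repository introduced in the previous patch is sketched below. This is an assumption for illustration only, not code from this patch series: the generated resolver stubs presumably panic with "not implemented" (as the Node field resolvers do), filter and order handling are omitted, and the hard-coded cluster name is a placeholder.

// Illustrative only: a possible implementation of two of the new Query
// resolvers on top of the node repository, replacing the generated stubs.
package graph

import (
	"context"
	"strconv"

	"github.com/ClusterCockpit/cc-backend/internal/graph/model"
	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

// Node resolves a single node by its database id.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
	numericID, err := strconv.ParseInt(id, 10, 64)
	if err != nil {
		return nil, err
	}
	return repository.GetNodeRepository().GetNode(numericID, false)
}

// Nodes lists nodes; a real implementation would translate the NodeFilter
// and OrderByInput arguments into SQL conditions instead of hard-coding
// one cluster.
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
	nodes, err := repository.GetNodeRepository().ListNodes("fritz") // placeholder cluster
	if err != nil {
		return nil, err
	}
	count := len(nodes)
	return &model.NodeStateResultList{Items: nodes, Count: &count}, nil
}
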
+ node(id: ID!): Node + nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! + nodeStats(filter: [NodeFilter!]): [NodeStats!]! + job(id: ID!): Job jobMetrics( id: ID! @@ -2542,7 +2632,9 @@ type Query { scopes: [MetricScope!] resolution: Int ): [JobMetricWithName!]! + jobStats(id: ID!, metrics: [String!]): [NamedStats!]! + scopedJobStats( id: ID! metrics: [String!] @@ -2554,6 +2646,7 @@ type Query { page: PageRequest order: OrderByInput ): JobResultList! + jobsStatistics( filter: [JobFilter!] metrics: [String!] @@ -2563,6 +2656,7 @@ type Query { numDurationBins: String numMetricBins: Int ): [JobsStatistics!]! + jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints @@ -2617,6 +2711,14 @@ type TimeRangeOutput { to: Time! } +input NodeFilter { + hostname: StringInput + cluster: StringInput + subCluster: StringInput + nodeState: NodeState + healthState: MonitoringState +} + input JobFilter { tags: [ID!] dbId: [ID!] @@ -2678,6 +2780,11 @@ input FloatRange { to: Float! } +type NodeStateResultList { + items: [Node!]! + count: Int +} + type JobResultList { items: [Job!]! offset: Int @@ -3955,6 +4062,113 @@ func (ec *executionContext) field_Query_nodeMetrics_argsTo( return zeroVal, nil } +func (ec *executionContext) field_Query_nodeStats_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_nodeStats_argsFilter(ctx, rawArgs) + if err != nil { + return nil, err + } + args["filter"] = arg0 + return args, nil +} +func (ec *executionContext) field_Query_nodeStats_argsFilter( + ctx context.Context, + rawArgs map[string]any, +) ([]*model.NodeFilter, error) { + if _, ok := rawArgs["filter"]; !ok { + var zeroVal []*model.NodeFilter + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("filter")) + if tmp, ok := rawArgs["filter"]; ok { + return ec.unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx, tmp) + } + + var zeroVal []*model.NodeFilter + return zeroVal, nil +} + +func (ec *executionContext) field_Query_node_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_node_argsID(ctx, rawArgs) + if err != nil { + return nil, err + } + args["id"] = arg0 + return args, nil +} +func (ec *executionContext) field_Query_node_argsID( + ctx context.Context, + rawArgs map[string]any, +) (string, error) { + if _, ok := rawArgs["id"]; !ok { + var zeroVal string + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("id")) + if tmp, ok := rawArgs["id"]; ok { + return ec.unmarshalNID2string(ctx, tmp) + } + + var zeroVal string + return zeroVal, nil +} + +func (ec *executionContext) field_Query_nodes_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_nodes_argsFilter(ctx, rawArgs) + if err != nil { + return nil, err + } + args["filter"] = arg0 + arg1, err := ec.field_Query_nodes_argsOrder(ctx, rawArgs) + if err != nil { + return nil, err + } + args["order"] = arg1 + return args, nil +} +func (ec *executionContext) field_Query_nodes_argsFilter( + ctx context.Context, + rawArgs map[string]any, +) ([]*model.NodeFilter, error) { + if _, ok := rawArgs["filter"]; !ok { + var zeroVal []*model.NodeFilter + return zeroVal, nil + 
} + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("filter")) + if tmp, ok := rawArgs["filter"]; ok { + return ec.unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx, tmp) + } + + var zeroVal []*model.NodeFilter + return zeroVal, nil +} + +func (ec *executionContext) field_Query_nodes_argsOrder( + ctx context.Context, + rawArgs map[string]any, +) (*model.OrderByInput, error) { + if _, ok := rawArgs["order"]; !ok { + var zeroVal *model.OrderByInput + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("order")) + if tmp, ok := rawArgs["order"]; ok { + return ec.unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput(ctx, tmp) + } + + var zeroVal *model.OrderByInput + return zeroVal, nil +} + func (ec *executionContext) field_Query_rooflineHeatmap_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { var err error args := map[string]any{} @@ -11040,6 +11254,195 @@ func (ec *executionContext) fieldContext_NodeMetrics_metrics(_ context.Context, return fc, nil } +func (ec *executionContext) _NodeStateResultList_items(ctx context.Context, field graphql.CollectedField, obj *model.NodeStateResultList) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStateResultList_items(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Items, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.([]*schema.Node) + fc.Result = res + return ec.marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeᚄ(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStateResultList_items(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStateResultList", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "id": + return ec.fieldContext_Node_id(ctx, field) + case "hostname": + return ec.fieldContext_Node_hostname(ctx, field) + case "cluster": + return ec.fieldContext_Node_cluster(ctx, field) + case "subCluster": + return ec.fieldContext_Node_subCluster(ctx, field) + case "nodeState": + return ec.fieldContext_Node_nodeState(ctx, field) + case "HealthState": + return ec.fieldContext_Node_HealthState(ctx, field) + case "metaData": + return ec.fieldContext_Node_metaData(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStateResultList_count(ctx context.Context, field graphql.CollectedField, obj *model.NodeStateResultList) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStateResultList_count(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := 
ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Count, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*int) + fc.Result = res + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStateResultList_count(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStateResultList", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStats_state(ctx context.Context, field graphql.CollectedField, obj *model.NodeStats) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStats_state(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.State, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStats_state(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStats", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStats_count(ctx context.Context, field graphql.CollectedField, obj *model.NodeStats) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStats_count(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Count, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(int) + fc.Result = res + return ec.marshalNInt2int(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStats_count(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStats", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + func (ec 
*executionContext) _NodesResultList_items(ctx context.Context, field graphql.CollectedField, obj *model.NodesResultList) (ret graphql.Marshaler) { fc, err := ec.fieldContext_NodesResultList_items(ctx, field) if err != nil { @@ -11580,6 +11983,196 @@ func (ec *executionContext) fieldContext_Query_allocatedNodes(ctx context.Contex return fc, nil } +func (ec *executionContext) _Query_node(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_node(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().Node(rctx, fc.Args["id"].(string)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*schema.Node) + fc.Result = res + return ec.marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Query_node(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "id": + return ec.fieldContext_Node_id(ctx, field) + case "hostname": + return ec.fieldContext_Node_hostname(ctx, field) + case "cluster": + return ec.fieldContext_Node_cluster(ctx, field) + case "subCluster": + return ec.fieldContext_Node_subCluster(ctx, field) + case "nodeState": + return ec.fieldContext_Node_nodeState(ctx, field) + case "HealthState": + return ec.fieldContext_Node_HealthState(ctx, field) + case "metaData": + return ec.fieldContext_Node_metaData(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_node_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + +func (ec *executionContext) _Query_nodes(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_nodes(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().Nodes(rctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["order"].(*model.OrderByInput)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(*model.NodeStateResultList) + fc.Result = res + return ec.marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx, field.Selections, res) +} + +func (ec 
*executionContext) fieldContext_Query_nodes(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "items": + return ec.fieldContext_NodeStateResultList_items(ctx, field) + case "count": + return ec.fieldContext_NodeStateResultList_count(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type NodeStateResultList", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_nodes_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + +func (ec *executionContext) _Query_nodeStats(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_nodeStats(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().NodeStats(rctx, fc.Args["filter"].([]*model.NodeFilter)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.([]*model.NodeStats) + fc.Result = res + return ec.marshalNNodeStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatsᚄ(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Query_nodeStats(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "state": + return ec.fieldContext_NodeStats_state(ctx, field) + case "count": + return ec.fieldContext_NodeStats_count(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type NodeStats", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_nodeStats_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + func (ec *executionContext) _Query_job(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { fc, err := ec.fieldContext_Query_job(ctx, field) if err != nil { @@ -17146,6 +17739,61 @@ func (ec *executionContext) unmarshalInputMetricStatItem(ctx context.Context, ob return it, nil } +func (ec *executionContext) unmarshalInputNodeFilter(ctx context.Context, obj any) (model.NodeFilter, error) { + var it model.NodeFilter + asMap := map[string]any{} + for k, v := range obj.(map[string]any) { + asMap[k] = v + } + + fieldsInOrder := [...]string{"hostname", "cluster", "subCluster", "nodeState", "healthState"} + for _, k := range fieldsInOrder { + v, ok := asMap[k] 
+ if !ok { + continue + } + switch k { + case "hostname": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("hostname")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.Hostname = data + case "cluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("cluster")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.Cluster = data + case "subCluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("subCluster")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.SubCluster = data + case "nodeState": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("nodeState")) + data, err := ec.unmarshalONodeState2ᚖstring(ctx, v) + if err != nil { + return it, err + } + it.NodeState = data + case "healthState": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("healthState")) + data, err := ec.unmarshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx, v) + if err != nil { + return it, err + } + it.HealthState = data + } + } + + return it, nil +} + func (ec *executionContext) unmarshalInputOrderByInput(ctx context.Context, obj any) (model.OrderByInput, error) { var it model.OrderByInput asMap := map[string]any{} @@ -19358,6 +20006,91 @@ func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionS return out } +var nodeStateResultListImplementors = []string{"NodeStateResultList"} + +func (ec *executionContext) _NodeStateResultList(ctx context.Context, sel ast.SelectionSet, obj *model.NodeStateResultList) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, nodeStateResultListImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("NodeStateResultList") + case "items": + out.Values[i] = ec._NodeStateResultList_items(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "count": + out.Values[i] = ec._NodeStateResultList_count(ctx, field, obj) + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + +var nodeStatsImplementors = []string{"NodeStats"} + +func (ec *executionContext) _NodeStats(ctx context.Context, sel ast.SelectionSet, obj *model.NodeStats) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, nodeStatsImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("NodeStats") + case "state": + out.Values[i] = ec._NodeStats_state(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "count": + out.Values[i] = ec._NodeStats_count(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ 
+ } + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + var nodesResultListImplementors = []string{"NodesResultList"} func (ec *executionContext) _NodesResultList(ctx context.Context, sel ast.SelectionSet, obj *model.NodesResultList) graphql.Marshaler { @@ -19532,6 +20265,69 @@ func (ec *executionContext) _Query(ctx context.Context, sel ast.SelectionSet) gr func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "node": + field := field + + innerFunc := func(ctx context.Context, _ *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_node(ctx, field) + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "nodes": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_nodes(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "nodeStats": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_nodeStats(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) case "job": field := field @@ -22023,6 +22819,65 @@ func (ec *executionContext) marshalNNamedStatsWithScope2ᚖgithubᚗcomᚋCluste return ec._NamedStatsWithScope(ctx, sel, v) } +func (ec *executionContext) marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Node) graphql.Marshaler { + ret := make(graphql.Array, len(v)) + var wg sync.WaitGroup + isLen1 := len(v) == 1 + if !isLen1 { + wg.Add(len(v)) + } + for i := range v { + i := i + fc := &graphql.FieldContext{ + Index: &i, + Result: &v[i], + } + ctx := graphql.WithFieldContext(ctx, fc) + f := func(i int) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = nil + } + }() + if !isLen1 { + defer wg.Done() + } + ret[i] = 
ec.marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx, sel, v[i]) + } + if isLen1 { + f(i) + } else { + go f(i) + } + + } + wg.Wait() + + for _, e := range ret { + if e == graphql.Null { + return graphql.Null + } + } + + return ret +} + +func (ec *executionContext) marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._Node(ctx, sel, v) +} + +func (ec *executionContext) unmarshalNNodeFilter2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilter(ctx context.Context, v any) (*model.NodeFilter, error) { + res, err := ec.unmarshalInputNodeFilter(ctx, v) + return &res, graphql.ErrorOnPath(ctx, err) +} + func (ec *executionContext) marshalNNodeMetrics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetricsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeMetrics) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup @@ -22092,6 +22947,74 @@ func (ec *executionContext) marshalNNodeState2string(ctx context.Context, sel as return res } +func (ec *executionContext) marshalNNodeStateResultList2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx context.Context, sel ast.SelectionSet, v model.NodeStateResultList) graphql.Marshaler { + return ec._NodeStateResultList(ctx, sel, &v) +} + +func (ec *executionContext) marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx context.Context, sel ast.SelectionSet, v *model.NodeStateResultList) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._NodeStateResultList(ctx, sel, v) +} + +func (ec *executionContext) marshalNNodeStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeStats) graphql.Marshaler { + ret := make(graphql.Array, len(v)) + var wg sync.WaitGroup + isLen1 := len(v) == 1 + if !isLen1 { + wg.Add(len(v)) + } + for i := range v { + i := i + fc := &graphql.FieldContext{ + Index: &i, + Result: &v[i], + } + ctx := graphql.WithFieldContext(ctx, fc) + f := func(i int) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = nil + } + }() + if !isLen1 { + defer wg.Done() + } + ret[i] = ec.marshalNNodeStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStats(ctx, sel, v[i]) + } + if isLen1 { + f(i) + } else { + go f(i) + } + + } + wg.Wait() + + for _, e := range ret { + if e == graphql.Null { + return graphql.Null + } + } + + return ret +} + +func (ec *executionContext) marshalNNodeStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStats(ctx context.Context, sel ast.SelectionSet, v *model.NodeStats) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._NodeStats(ctx, sel, v) +} + func (ec *executionContext) marshalNNodesResultList2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList(ctx 
context.Context, sel ast.SelectionSet, v model.NodesResultList) graphql.Marshaler { return ec._NodesResultList(ctx, sel, &v) } @@ -23373,6 +24296,66 @@ func (ec *executionContext) marshalOMetricStatistics2githubᚗcomᚋClusterCockp return ec._MetricStatistics(ctx, sel, &v) } +func (ec *executionContext) unmarshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, v any) (*schema.NodeState, error) { + if v == nil { + return nil, nil + } + tmp, err := graphql.UnmarshalString(v) + res := schema.NodeState(tmp) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, sel ast.SelectionSet, v *schema.NodeState) graphql.Marshaler { + if v == nil { + return graphql.Null + } + res := graphql.MarshalString(string(*v)) + return res +} + +func (ec *executionContext) marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { + if v == nil { + return graphql.Null + } + return ec._Node(ctx, sel, v) +} + +func (ec *executionContext) unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx context.Context, v any) ([]*model.NodeFilter, error) { + if v == nil { + return nil, nil + } + var vSlice []any + if v != nil { + vSlice = graphql.CoerceList(v) + } + var err error + res := make([]*model.NodeFilter, len(vSlice)) + for i := range vSlice { + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithIndex(i)) + res[i], err = ec.unmarshalNNodeFilter2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilter(ctx, vSlice[i]) + if err != nil { + return nil, err + } + } + return res, nil +} + +func (ec *executionContext) unmarshalONodeState2ᚖstring(ctx context.Context, v any) (*string, error) { + if v == nil { + return nil, nil + } + res, err := graphql.UnmarshalString(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalONodeState2ᚖstring(ctx context.Context, sel ast.SelectionSet, v *string) graphql.Marshaler { + if v == nil { + return graphql.Null + } + res := graphql.MarshalString(*v) + return res +} + func (ec *executionContext) unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput(ctx context.Context, v any) (*model.OrderByInput, error) { if v == nil { return nil, nil diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index 5c50ff9..fc05280 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -167,12 +167,30 @@ type NamedStatsWithScope struct { Stats []*ScopedStats `json:"stats"` } +type NodeFilter struct { + Hostname *StringInput `json:"hostname,omitempty"` + Cluster *StringInput `json:"cluster,omitempty"` + SubCluster *StringInput `json:"subCluster,omitempty"` + NodeState *string `json:"nodeState,omitempty"` + HealthState *schema.NodeState `json:"healthState,omitempty"` +} + type NodeMetrics struct { Host string `json:"host"` SubCluster string `json:"subCluster"` Metrics []*JobMetricWithName `json:"metrics"` } +type NodeStateResultList struct { + Items []*schema.Node `json:"items"` + Count *int `json:"count,omitempty"` +} + +type NodeStats struct { + State string `json:"state"` + Count int `json:"count"` +} + type NodesResultList struct { Items []*NodeMetrics `json:"items"` Offset *int `json:"offset,omitempty"` diff --git a/internal/graph/schema.resolvers.go 
b/internal/graph/schema.resolvers.go index 7d2331f..af167b4 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -358,6 +358,21 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]* return counts, nil } +// Node is the resolver for the node field. +func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) { + panic(fmt.Errorf("not implemented: Node - node")) +} + +// Nodes is the resolver for the nodes field. +func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) { + panic(fmt.Errorf("not implemented: Nodes - nodes")) +} + +// NodeStats is the resolver for the nodeStats field. +func (r *queryResolver) NodeStats(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStats, error) { + panic(fmt.Errorf("not implemented: NodeStats - nodeStats")) +} + // Job is the resolver for the job field. func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) { numericId, err := strconv.ParseInt(id, 10, 64) diff --git a/internal/repository/node.go b/internal/repository/node.go index 3713bbd..78323e6 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -159,16 +159,16 @@ func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) { return node.ID, nil } -func (r *NodeRepository) UpdateNodeState(id int64, nodeState *schema.NodeState) error { - if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { - log.Errorf("error while updating node '%d'", id) +func (r *NodeRepository) UpdateNodeState(hostname string, nodeState *schema.NodeState) error { + if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.hostname = ?", hostname).RunWith(r.DB).Exec(); err != nil { + log.Errorf("error while updating node '%s'", hostname) return err } return nil } -func (r *NodeRepository) UpdateHealthState(id int64, healthState *schema.MonitoringState) error { +func (r *NodeRepository) UpdateHealthState(hostname string, healthState *schema.MonitoringState) error { if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { log.Errorf("error while updating node '%d'", id) return err From 1d8e7e072f485e38986c9062c75c3732d13853c3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 5 Jun 2025 13:23:36 +0200 Subject: [PATCH 41/45] Refactor rest api --- internal/api/cluster.go | 70 +++ internal/api/job.go | 987 +++++++++++++++++++++++++++++++++ internal/api/node.go | 30 + internal/api/rest.go | 1172 +-------------------------------------- internal/api/user.go | 159 ++++++ 5 files changed, 1249 insertions(+), 1169 deletions(-) create mode 100644 internal/api/cluster.go create mode 100644 internal/api/job.go create mode 100644 internal/api/node.go create mode 100644 internal/api/user.go diff --git a/internal/api/cluster.go b/internal/api/cluster.go new file mode 100644 index 0000000..0529480 --- /dev/null +++ b/internal/api/cluster.go @@ -0,0 +1,70 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package api + +import ( + "bufio" + "encoding/json" + "fmt" + "net/http" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +// GetClustersApiResponse model +type GetClustersApiResponse struct { + Clusters []*schema.Cluster `json:"clusters"` // Array of clusters +} + +// getClusters godoc +// @summary Lists all cluster configs +// @tags Cluster query +// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. +// @produce json +// @param cluster query string false "Job Cluster" +// @success 200 {object} api.GetClustersApiResponse "Array of clusters" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/clusters/ [get] +func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) { + if user := repository.GetUserFromContext(r.Context()); user != nil && + !user.HasRole(schema.RoleApi) { + + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + return + } + + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + var clusters []*schema.Cluster + + if r.URL.Query().Has("cluster") { + name := r.URL.Query().Get("cluster") + cluster := archive.GetCluster(name) + if cluster == nil { + handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw) + return + } + clusters = append(clusters, cluster) + } else { + clusters = archive.Clusters + } + + payload := GetClustersApiResponse{ + Clusters: clusters, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} diff --git a/internal/api/job.go b/internal/api/job.go new file mode 100644 index 0000000..1af6c38 --- /dev/null +++ b/internal/api/job.go @@ -0,0 +1,987 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package api + +import ( + "bufio" + "database/sql" + "encoding/json" + "errors" + "fmt" + "net/http" + "strconv" + "strings" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/archiver" + "github.com/ClusterCockpit/cc-backend/internal/graph" + "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/internal/importer" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/gorilla/mux" +) + +// DefaultApiResponse model +type DefaultJobApiResponse struct { + Message string `json:"msg"` +} + +// StopJobApiRequest model +type StopJobApiRequest struct { + JobId *int64 `json:"jobId" example:"123000"` + Cluster *string `json:"cluster" example:"fritz"` + StartTime *int64 `json:"startTime" example:"1649723812"` + State schema.JobState `json:"jobState" validate:"required" example:"completed"` + StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"` +} + +// DeleteJobApiRequest model +type DeleteJobApiRequest struct { + JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job + Cluster *string `json:"cluster" example:"fritz"` // Cluster of job + StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch +} + +// GetJobsApiResponse model +type GetJobsApiResponse struct { + Jobs []*schema.Job `json:"jobs"` // Array of jobs + Items int `json:"items"` // Number of jobs returned + Page int `json:"page"` // Page id returned +} + +// ApiTag model +type ApiTag struct { + // Tag Type + Type string `json:"type" example:"Debug"` + Name string `json:"name" example:"Testjob"` // Tag Name + Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display +} + +// ApiMeta model +type EditMetaRequest struct { + Key string `json:"key" example:"jobScript"` + Value string `json:"value" example:"bash script"` +} + +type TagJobApiRequest []*ApiTag + +type GetJobApiRequest []string + +type GetJobApiResponse struct { + Meta *schema.Job + Data []*JobMetricWithName +} + +type GetCompleteJobApiResponse struct { + Meta *schema.Job + Data schema.JobData +} + +type JobMetricWithName struct { + Metric *schema.JobMetric `json:"metric"` + Name string `json:"name"` + Scope schema.MetricScope `json:"scope"` +} + +// getJobs godoc +// @summary Lists all jobs +// @tags Job query +// @description Get a list of all jobs. Filters can be applied using query parameters. +// @description Number of results can be limited by page. Results are sorted by descending startTime. +// @produce json +// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout) +// @param cluster query string false "Job Cluster" +// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds" +// @param items-per-page query int false "Items per page (Default: 25)" +// @param page query int false "Page Number (Default: 1)" +// @param with-metadata query bool false "Include metadata (e.g. 
jobScript) in response" +// @success 200 {object} api.GetJobsApiResponse "Job array and page info" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/ [get] +func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { + withMetadata := false + filter := &model.JobFilter{} + page := &model.PageRequest{ItemsPerPage: 25, Page: 1} + order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc} + + for key, vals := range r.URL.Query() { + switch key { + case "state": + for _, s := range vals { + state := schema.JobState(s) + if !state.Valid() { + handleError(fmt.Errorf("invalid query parameter value: state"), + http.StatusBadRequest, rw) + return + } + filter.State = append(filter.State, state) + } + case "cluster": + filter.Cluster = &model.StringInput{Eq: &vals[0]} + case "start-time": + st := strings.Split(vals[0], "-") + if len(st) != 2 { + handleError(fmt.Errorf("invalid query parameter value: startTime"), + http.StatusBadRequest, rw) + return + } + from, err := strconv.ParseInt(st[0], 10, 64) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + to, err := strconv.ParseInt(st[1], 10, 64) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + ufrom, uto := time.Unix(from, 0), time.Unix(to, 0) + filter.StartTime = &schema.TimeRange{From: &ufrom, To: &uto} + case "page": + x, err := strconv.Atoi(vals[0]) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + page.Page = x + case "items-per-page": + x, err := strconv.Atoi(vals[0]) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + page.ItemsPerPage = x + case "with-metadata": + withMetadata = true + default: + handleError(fmt.Errorf("invalid query parameter: %s", key), + http.StatusBadRequest, rw) + return + } + } + + jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + results := make([]*schema.Job, 0, len(jobs)) + for _, job := range jobs { + if withMetadata { + if _, err = api.JobRepository.FetchMetadata(job); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { + job.Statistics, err = archive.GetStatistics(job) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + } + + results = append(results, job) + } + + log.Debugf("/api/jobs: %d jobs returned", len(results)) + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + payload := GetJobsApiResponse{ + Jobs: results, + Items: page.ItemsPerPage, + Page: page.Page, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} + +// getCompleteJobById godoc +// @summary Get job meta and optional all metric data +// @tags Job query +// @description Job to get is specified by database ID +// @description Returns full job resource information 
according to 'JobMeta' scheme and all metrics according to 'JobData'. +// @produce json +// @param id path int true "Database ID of Job" +// @param all-metrics query bool false "Include all available metrics" +// @success 200 {object} api.GetJobApiResponse "Job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 404 {object} api.ErrorResponse "Resource not found" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/{id} [get] +func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) { + // Fetch job from db + id, ok := mux.Vars(r)["id"] + var job *schema.Job + var err error + if ok { + id, e := strconv.ParseInt(id, 10, 64) + if e != nil { + handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) + return + } + + job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID + } else { + handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw) + return + } + if err != nil { + handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + + } + if _, err = api.JobRepository.FetchMetadata(job); err != nil { + + handleError(err, http.StatusInternalServerError, rw) + return + } + + var scopes []schema.MetricScope + + if job.NumNodes == 1 { + scopes = []schema.MetricScope{"core"} + } else { + scopes = []schema.MetricScope{"node"} + } + + var data schema.JobData + + metricConfigs := archive.GetCluster(job.Cluster).MetricConfig + resolution := 0 + + for _, mc := range metricConfigs { + resolution = max(resolution, mc.Timestep) + } + + if r.URL.Query().Get("all-metrics") == "true" { + data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution) + if err != nil { + log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster) + return + } + } + + log.Debugf("/api/job/%s: get job %d", id, job.JobID) + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + payload := GetCompleteJobApiResponse{ + Meta: job, + Data: data, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} + +// getJobById godoc +// @summary Get job meta and configurable metric data +// @tags Job query +// @description Job to get is specified by database ID +// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. 
+// @accept json +// @produce json +// @param id path int true "Database ID of Job" +// @param request body api.GetJobApiRequest true "Array of metric names" +// @success 200 {object} api.GetJobApiResponse "Job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 404 {object} api.ErrorResponse "Resource not found" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/{id} [post] +func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) { + // Fetch job from db + id, ok := mux.Vars(r)["id"] + var job *schema.Job + var err error + if ok { + id, e := strconv.ParseInt(id, 10, 64) + if e != nil { + handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) + return + } + + job, err = api.JobRepository.FindById(r.Context(), id) + } else { + handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) + return + } + if err != nil { + handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + + } + if _, err = api.JobRepository.FetchMetadata(job); err != nil { + + handleError(err, http.StatusInternalServerError, rw) + return + } + + var metrics GetJobApiRequest + if err = decode(r.Body, &metrics); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + var scopes []schema.MetricScope + + if job.NumNodes == 1 { + scopes = []schema.MetricScope{"core"} + } else { + scopes = []schema.MetricScope{"node"} + } + + metricConfigs := archive.GetCluster(job.Cluster).MetricConfig + resolution := 0 + + for _, mc := range metricConfigs { + resolution = max(resolution, mc.Timestep) + } + + data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution) + if err != nil { + log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster) + return + } + + res := []*JobMetricWithName{} + for name, md := range data { + for scope, metric := range md { + res = append(res, &JobMetricWithName{ + Name: name, + Scope: scope, + Metric: metric, + }) + } + } + + log.Debugf("/api/job/%s: get job %d", id, job.JobID) + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + payload := GetJobApiResponse{ + Meta: job, + Data: res, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} + +// editMeta godoc +// @summary Edit meta-data json +// @tags Job add and modify +// @description Edit key value pairs in job metadata json +// @description If a key already exists its content will be overwritten +// @accept json +// @produce json +// @param id path int true "Job Database ID" +// @param request body api.EditMetaRequest true "Kay value pair to add" +// @success 200 {object} schema.Job "Updated job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job does not exist" +// @failure 500 
{object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/edit_meta/{id} [post] +func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) { + id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + job, err := api.JobRepository.FindById(r.Context(), id) + if err != nil { + http.Error(rw, err.Error(), http.StatusNotFound) + return + } + + var req EditMetaRequest + if err := decode(r.Body, &req); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusOK) + json.NewEncoder(rw).Encode(job) +} + +// tagJob godoc +// @summary Adds one or more tags to a job +// @tags Job add and modify +// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely. +// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username. +// @description If tagged job is already finished: Tag will be written directly to respective archive files. +// @accept json +// @produce json +// @param id path int true "Job Database ID" +// @param request body api.TagJobApiRequest true "Array of tag-objects to add" +// @success 200 {object} schema.Job "Updated job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/tag_job/{id} [post] +func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { + id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + job, err := api.JobRepository.FindById(r.Context(), id) + if err != nil { + http.Error(rw, err.Error(), http.StatusNotFound) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + var req TagJobApiRequest + if err := decode(r.Body, &req); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + for _, tag := range req { + tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + job.Tags = append(job.Tags, &schema.Tag{ + ID: tagId, + Type: tag.Type, + Name: tag.Name, + Scope: tag.Scope, + }) + } + + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusOK) + json.NewEncoder(rw).Encode(job) +} + +// removeTagJob godoc +// @summary Removes one or more tags from a job +// @tags Job add and modify +// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match. +// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. +// @description If tagged job is already finished: Tag will be removed from respective archive files. 
+// @accept json +// @produce json +// @param id path int true "Job Database ID" +// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @success 200 {object} schema.Job "Updated job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /jobs/tag_job/{id} [delete] +func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) { + id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + job, err := api.JobRepository.FindById(r.Context(), id) + if err != nil { + http.Error(rw, err.Error(), http.StatusNotFound) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + var req TagJobApiRequest + if err := decode(r.Body, &req); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + for _, rtag := range req { + // Only Global and Admin Tags + if rtag.Scope != "global" && rtag.Scope != "admin" { + log.Warnf("Cannot delete private tag for job %d: Skip", job.JobID) + continue + } + + remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + job.Tags = remainingTags + } + + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusOK) + json.NewEncoder(rw).Encode(job) +} + +// removeTags godoc +// @summary Removes all tags and job-relations for type:name tuple +// @tags Tag remove +// @description Removes tags by type and name. Name and Type of Tag(s) must match. +// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. +// @description Tag wills be removed from respective archive files. 
+// @accept json +// @produce plain +// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @success 200 {string} string "Success Response" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /tags/ [delete] +func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) { + var req TagJobApiRequest + if err := decode(r.Body, &req); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + targetCount := len(req) + currentCount := 0 + for _, rtag := range req { + // Only Global and Admin Tags + if rtag.Scope != "global" && rtag.Scope != "admin" { + log.Warn("Cannot delete private tag: Skip") + continue + } + + err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } else { + currentCount++ + } + } + + rw.WriteHeader(http.StatusOK) + fmt.Fprintf(rw, "Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount) +} + +// startJob godoc +// @summary Adds a new job as "running" +// @tags Job add and modify +// @description Job specified in request body will be saved to database as "running" with new DB ID. +// @description Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met. +// @accept json +// @produce json +// @param request body schema.JobMeta true "Job to add" +// @success 201 {object} api.DefaultJobApiResponse "Job added successfully" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/start_job/ [post] +func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { + req := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + if err := decode(r.Body, &req); err != nil { + handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) + return + } + + log.Printf("REST: %s\n", req.GoString()) + req.State = schema.JobStateRunning + + if err := importer.SanityChecks(&req); err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + + // aquire lock to avoid race condition between API calls + var unlockOnce sync.Once + api.RepositoryMutex.Lock() + defer unlockOnce.Do(api.RepositoryMutex.Unlock) + + // Check if combination of (job_id, cluster_id, start_time) already exists: + jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil) + if err != nil && err != sql.ErrNoRows { + handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw) + return + } else if err == nil { + for _, job := range jobs { + if (req.StartTime - job.StartTime) < 86400 { + handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw) + return + } + } + } + + id, err := api.JobRepository.Start(&req) + if err != nil { + 
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw) + return + } + // unlock here, adding Tags can be async + unlockOnce.Do(api.RepositoryMutex.Unlock) + + for _, tag := range req.Tags { + if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw) + return + } + } + + log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime) + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusCreated) + json.NewEncoder(rw).Encode(DefaultJobApiResponse{ + Message: "success", + }) +} + +// stopJobByRequest godoc +// @summary Marks job as completed and triggers archiving +// @tags Job add and modify +// @description Job to stop is specified by request body. All fields are required in this case. +// @description Returns full job resource information according to 'JobMeta' scheme. +// @produce json +// @param request body api.StopJobApiRequest true "All fields required" +// @success 200 {object} schema.JobMeta "Success message" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 404 {object} api.ErrorResponse "Resource not found" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/stop_job/ [post] +func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { + // Parse request body + req := StopJobApiRequest{} + if err := decode(r.Body, &req); err != nil { + handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) + return + } + + // Fetch job (that will be stopped) from db + var job *schema.Job + var err error + if req.JobId == nil { + handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) + return + } + + // log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) + job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) + if err != nil { + job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime) + // FIXME: Previous error is hidden + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) + return + } + } + + api.checkAndHandleStopJob(rw, job, req) +} + +// deleteJobById godoc +// @summary Remove a job from the sql database +// @tags Job remove +// @description Job to remove is specified by database ID. This will not remove the job from the job archive. 
+// @produce json
+// @param id path int true "Database ID of Job"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job/{id} [delete]
+func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
+	// Parse database ID from request path and delete the job
+	id, ok := mux.Vars(r)["id"]
+	var err error
+	if ok {
+		id, e := strconv.ParseInt(id, 10, 64)
+		if e != nil {
+			handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
+			return
+		}
+
+		err = api.JobRepository.DeleteJobById(id)
+	} else {
+		handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
+		return
+	}
+	if err != nil {
+		handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted job %s", id),
+	})
+}
+
+// deleteJobByRequest godoc
+// @summary Remove a job from the sql database
+// @tags Job remove
+// @description Job to delete is specified by request body. All fields are required in this case.
+// @accept json
+// @produce json
+// @param request body api.DeleteJobApiRequest true "All fields required"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job/ [delete]
+func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) {
+	// Parse request body
+	req := DeleteJobApiRequest{}
+	if err := decode(r.Body, &req); err != nil {
+		handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
+		return
+	}
+
+	// Fetch job (that will be deleted) from db
+	var job *schema.Job
+	var err error
+	if req.JobId == nil {
+		handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
+		return
+	}
+
+	job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
+	if err != nil {
+		handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	err = api.JobRepository.DeleteJobById(*job.ID)
+	if err != nil {
+		handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
+	})
+}
+
+// deleteJobBefore godoc
+// @summary Remove jobs from the sql database
+// @tags Job remove
+// @description Remove all jobs with
start time before timestamp. The jobs will not be removed from the job archive.
+// @produce json
+// @param ts path int true "Unix epoch timestamp"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job_before/{ts} [delete]
+func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
+	var cnt int
+	// Parse cut-off timestamp from request path
+	id, ok := mux.Vars(r)["ts"]
+	var err error
+	if ok {
+		ts, e := strconv.ParseInt(id, 10, 64)
+		if e != nil {
+			handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
+			return
+		}
+
+		cnt, err = api.JobRepository.DeleteJobsBefore(ts)
+	} else {
+		handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw)
+		return
+	}
+	if err != nil {
+		handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
+	})
+}
+
+func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) {
+	// Sanity checks
+	if job.State != schema.JobStateRunning {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	if job.StartTime > req.StopTime {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be greater than or equal to startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
+		return
+	}
+
+	if req.State != "" && !req.State.Valid() {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
+		return
+	} else if req.State == "" {
+		req.State = schema.JobStateCompleted
+	}
+
+	// Mark job as stopped in the database (update state and duration)
+	job.Duration = int32(req.StopTime - job.StartTime)
+	job.State = req.State
+	api.JobRepository.Mutex.Lock()
+	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+		if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+			api.JobRepository.Mutex.Unlock()
+			handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
+			return
+		}
+	}
+	api.JobRepository.Mutex.Unlock()
+
+	log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
+
+	// Send a response (with status OK). This means that errors that happen from here on forward
+	// can *NOT* be communicated to the client. 
If reading from a MetricDataRepository or
+	// writing to the filesystem fails, the client will not know.
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(job)
+
+	// Monitoring is disabled...
+	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
+		return
+	}
+
+	// Trigger async archiving
+	archiver.TriggerArchiving(job)
+}
+
+func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
+	id := mux.Vars(r)["id"]
+	metrics := r.URL.Query()["metric"]
+	var scopes []schema.MetricScope
+	for _, scope := range r.URL.Query()["scope"] {
+		var s schema.MetricScope
+		if err := s.UnmarshalGQL(scope); err != nil {
+			http.Error(rw, err.Error(), http.StatusBadRequest)
+			return
+		}
+		scopes = append(scopes, s)
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+
+	type Response struct {
+		Data *struct {
+			JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
+		} `json:"data"`
+		Error *struct {
+			Message string `json:"message"`
+		} `json:"error"`
+	}
+
+	resolver := graph.GetResolverInstance()
+	data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil)
+	if err != nil {
+		json.NewEncoder(rw).Encode(Response{
+			Error: &struct {
+				Message string "json:\"message\""
+			}{Message: err.Error()},
+		})
+		return
+	}
+
+	json.NewEncoder(rw).Encode(Response{
+		Data: &struct {
+			JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\""
+		}{JobMetrics: data},
+	})
+}
diff --git a/internal/api/node.go b/internal/api/node.go
new file mode 100644
index 0000000..ab34b16
--- /dev/null
+++ b/internal/api/node.go
@@ -0,0 +1,30 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file. 
+package api + +import ( + "fmt" + "net/http" +) + +type Node struct { + Name string `json:"hostname"` + States []string `json:"states"` +} + +// updateNodeStatesRequest model +type UpdateNodeStatesRequest struct { + Nodes []Node `json:"nodes"` + Cluster string `json:"cluster" example:"fritz"` +} + +func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) { + // Parse request body + req := UpdateNodeStatesRequest{} + if err := decode(r.Body, &req); err != nil { + handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) + return + } +} diff --git a/internal/api/rest.go b/internal/api/rest.go index 31a5979..54472d8 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -5,30 +5,18 @@ package api import ( - "bufio" - "database/sql" "encoding/json" - "errors" "fmt" "io" "net/http" "os" "path/filepath" - "strconv" - "strings" "sync" - "time" - "github.com/ClusterCockpit/cc-backend/internal/archiver" "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/internal/graph" - "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/importer" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/util" - "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/gorilla/mux" @@ -73,6 +61,8 @@ func (api *RestApi) MountApiRoutes(r *mux.Router) { r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) // Cluster List r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet) + // Slurm node state + r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut) // Job Handler r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut) r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut) @@ -120,46 +110,13 @@ func (api *RestApi) MountConfigApiRoutes(r *mux.Router) { func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) { r.StrictSlash(true) - // Settings Frontrend Uses SessionAuth + // Settings Frontend Uses SessionAuth if api.Authentication != nil { r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet) r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost) } } -// DefaultApiResponse model -type DefaultJobApiResponse struct { - Message string `json:"msg"` -} - -// StopJobApiRequest model -type StopJobApiRequest struct { - JobId *int64 `json:"jobId" example:"123000"` - Cluster *string `json:"cluster" example:"fritz"` - StartTime *int64 `json:"startTime" example:"1649723812"` - State schema.JobState `json:"jobState" validate:"required" example:"completed"` - StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"` -} - -// DeleteJobApiRequest model -type DeleteJobApiRequest struct { - JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job - Cluster *string `json:"cluster" example:"fritz"` // Cluster of job - StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch -} - -// GetJobsApiResponse model -type GetJobsApiResponse struct { - Jobs []*schema.Job `json:"jobs"` // Array of jobs - Items int `json:"items"` // Number of jobs returned - Page int 
`json:"page"` // Page id returned -} - -// GetClustersApiResponse model -type GetClustersApiResponse struct { - Clusters []*schema.Cluster `json:"clusters"` // Array of clusters -} - // ErrorResponse model type ErrorResponse struct { // Statustext of Errorcode @@ -167,48 +124,6 @@ type ErrorResponse struct { Error string `json:"error"` // Error Message } -// ApiTag model -type ApiTag struct { - // Tag Type - Type string `json:"type" example:"Debug"` - Name string `json:"name" example:"Testjob"` // Tag Name - Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display -} - -// ApiMeta model -type EditMetaRequest struct { - Key string `json:"key" example:"jobScript"` - Value string `json:"value" example:"bash script"` -} - -type TagJobApiRequest []*ApiTag - -type GetJobApiRequest []string - -type GetJobApiResponse struct { - Meta *schema.Job - Data []*JobMetricWithName -} - -type GetCompleteJobApiResponse struct { - Meta *schema.Job - Data schema.JobData -} - -type JobMetricWithName struct { - Metric *schema.JobMetric `json:"metric"` - Name string `json:"name"` - Scope schema.MetricScope `json:"scope"` -} - -type ApiReturnedUser struct { - Username string `json:"username"` - Name string `json:"name"` - Roles []string `json:"roles"` - Email string `json:"email"` - Projects []string `json:"projects"` -} - func handleError(err error, statusCode int, rw http.ResponseWriter) { log.Warnf("REST ERROR : %s", err.Error()) rw.Header().Add("Content-Type", "application/json") @@ -225,1087 +140,6 @@ func decode(r io.Reader, val any) error { return dec.Decode(val) } -// getClusters godoc -// @summary Lists all cluster configs -// @tags Cluster query -// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. -// @produce json -// @param cluster query string false "Job Cluster" -// @success 200 {object} api.GetClustersApiResponse "Array of clusters" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/clusters/ [get] -func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) { - if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { - - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - var clusters []*schema.Cluster - - if r.URL.Query().Has("cluster") { - name := r.URL.Query().Get("cluster") - cluster := archive.GetCluster(name) - if cluster == nil { - handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw) - return - } - clusters = append(clusters, cluster) - } else { - clusters = archive.Clusters - } - - payload := GetClustersApiResponse{ - Clusters: clusters, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getJobs godoc -// @summary Lists all jobs -// @tags Job query -// @description Get a list of all jobs. Filters can be applied using query parameters. -// @description Number of results can be limited by page. Results are sorted by descending startTime. 
-// @produce json -// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout) -// @param cluster query string false "Job Cluster" -// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds" -// @param items-per-page query int false "Items per page (Default: 25)" -// @param page query int false "Page Number (Default: 1)" -// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response" -// @success 200 {object} api.GetJobsApiResponse "Job array and page info" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/ [get] -func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { - withMetadata := false - filter := &model.JobFilter{} - page := &model.PageRequest{ItemsPerPage: 25, Page: 1} - order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc} - - for key, vals := range r.URL.Query() { - switch key { - case "state": - for _, s := range vals { - state := schema.JobState(s) - if !state.Valid() { - handleError(fmt.Errorf("invalid query parameter value: state"), - http.StatusBadRequest, rw) - return - } - filter.State = append(filter.State, state) - } - case "cluster": - filter.Cluster = &model.StringInput{Eq: &vals[0]} - case "start-time": - st := strings.Split(vals[0], "-") - if len(st) != 2 { - handleError(fmt.Errorf("invalid query parameter value: startTime"), - http.StatusBadRequest, rw) - return - } - from, err := strconv.ParseInt(st[0], 10, 64) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - to, err := strconv.ParseInt(st[1], 10, 64) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - ufrom, uto := time.Unix(from, 0), time.Unix(to, 0) - filter.StartTime = &schema.TimeRange{From: &ufrom, To: &uto} - case "page": - x, err := strconv.Atoi(vals[0]) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - page.Page = x - case "items-per-page": - x, err := strconv.Atoi(vals[0]) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - page.ItemsPerPage = x - case "with-metadata": - withMetadata = true - default: - handleError(fmt.Errorf("invalid query parameter: %s", key), - http.StatusBadRequest, rw) - return - } - } - - jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - - results := make([]*schema.Job, 0, len(jobs)) - for _, job := range jobs { - if withMetadata { - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - - if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { - job.Statistics, err = archive.GetStatistics(job) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - } - - results = append(results, job) - } - - log.Debugf("/api/jobs: %d jobs returned", len(results)) - rw.Header().Add("Content-Type", "application/json") - bw := 
bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetJobsApiResponse{ - Jobs: results, - Items: page.ItemsPerPage, - Page: page.Page, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getCompleteJobById godoc -// @summary Get job meta and optional all metric data -// @tags Job query -// @description Job to get is specified by database ID -// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. -// @produce json -// @param id path int true "Database ID of Job" -// @param all-metrics query bool false "Include all available metrics" -// @success 200 {object} api.GetJobApiResponse "Job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/{id} [get] -func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job from db - id, ok := mux.Vars(r)["id"] - var job *schema.Job - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID - } else { - handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - - } - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - - handleError(err, http.StatusInternalServerError, rw) - return - } - - var scopes []schema.MetricScope - - if job.NumNodes == 1 { - scopes = []schema.MetricScope{"core"} - } else { - scopes = []schema.MetricScope{"node"} - } - - var data schema.JobData - - metricConfigs := archive.GetCluster(job.Cluster).MetricConfig - resolution := 0 - - for _, mc := range metricConfigs { - resolution = max(resolution, mc.Timestep) - } - - if r.URL.Query().Get("all-metrics") == "true" { - data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution) - if err != nil { - log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster) - return - } - } - - log.Debugf("/api/job/%s: get job %d", id, job.JobID) - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetCompleteJobApiResponse{ - Meta: job, - Data: data, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getJobById godoc -// @summary Get job meta and configurable metric data -// @tags Job query -// @description Job to get is specified by database ID -// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. 
-// @accept json -// @produce json -// @param id path int true "Database ID of Job" -// @param request body api.GetJobApiRequest true "Array of metric names" -// @success 200 {object} api.GetJobApiResponse "Job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/{id} [post] -func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job from db - id, ok := mux.Vars(r)["id"] - var job *schema.Job - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.FindById(r.Context(), id) - } else { - handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - - } - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - - handleError(err, http.StatusInternalServerError, rw) - return - } - - var metrics GetJobApiRequest - if err = decode(r.Body, &metrics); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - var scopes []schema.MetricScope - - if job.NumNodes == 1 { - scopes = []schema.MetricScope{"core"} - } else { - scopes = []schema.MetricScope{"node"} - } - - metricConfigs := archive.GetCluster(job.Cluster).MetricConfig - resolution := 0 - - for _, mc := range metricConfigs { - resolution = max(resolution, mc.Timestep) - } - - data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution) - if err != nil { - log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster) - return - } - - res := []*JobMetricWithName{} - for name, md := range data { - for scope, metric := range md { - res = append(res, &JobMetricWithName{ - Name: name, - Scope: scope, - Metric: metric, - }) - } - } - - log.Debugf("/api/job/%s: get job %d", id, job.JobID) - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetJobApiResponse{ - Meta: job, - Data: res, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// editMeta godoc -// @summary Edit meta-data json -// @tags Job add and modify -// @description Edit key value pairs in job metadata json -// @description If a key already exists its content will be overwritten -// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.EditMetaRequest true "Kay value pair to add" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job does not exist" -// @failure 500 
{object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/edit_meta/{id} [post] -func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - var req EditMetaRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// tagJob godoc -// @summary Adds one or more tags to a job -// @tags Job add and modify -// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely. -// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username. -// @description If tagged job is already finished: Tag will be written directly to respective archive files. -// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to add" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/tag_job/{id} [post] -func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - for _, tag := range req { - tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - job.Tags = append(job.Tags, &schema.Tag{ - ID: tagId, - Type: tag.Type, - Name: tag.Name, - Scope: tag.Scope, - }) - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// removeTagJob godoc -// @summary Removes one or more tags from a job -// @tags Job add and modify -// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match. -// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. -// @description If tagged job is already finished: Tag will be removed from respective archive files. 
-// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /jobs/tag_job/{id} [delete] -func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - for _, rtag := range req { - // Only Global and Admin Tags - if rtag.Scope != "global" && rtag.Scope != "admin" { - log.Warnf("Cannot delete private tag for job %d: Skip", job.JobID) - continue - } - - remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - job.Tags = remainingTags - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// removeTags godoc -// @summary Removes all tags and job-relations for type:name tuple -// @tags Tag remove -// @description Removes tags by type and name. Name and Type of Tag(s) must match. -// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. -// @description Tag wills be removed from respective archive files. 
-// @accept json -// @produce plain -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" -// @success 200 {string} string "Success Response" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /tags/ [delete] -func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) { - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - targetCount := len(req) - currentCount := 0 - for _, rtag := range req { - // Only Global and Admin Tags - if rtag.Scope != "global" && rtag.Scope != "admin" { - log.Warn("Cannot delete private tag: Skip") - continue - } - - err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } else { - currentCount++ - } - } - - rw.WriteHeader(http.StatusOK) - fmt.Fprintf(rw, "Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount) -} - -// startJob godoc -// @summary Adds a new job as "running" -// @tags Job add and modify -// @description Job specified in request body will be saved to database as "running" with new DB ID. -// @description Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met. -// @accept json -// @produce json -// @param request body schema.JobMeta true "Job to add" -// @success 201 {object} api.DefaultJobApiResponse "Job added successfully" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/start_job/ [post] -func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { - req := schema.Job{ - Exclusive: 1, - MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, - } - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - log.Printf("REST: %s\n", req.GoString()) - req.State = schema.JobStateRunning - - if err := importer.SanityChecks(&req); err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - - // aquire lock to avoid race condition between API calls - var unlockOnce sync.Once - api.RepositoryMutex.Lock() - defer unlockOnce.Do(api.RepositoryMutex.Unlock) - - // Check if combination of (job_id, cluster_id, start_time) already exists: - jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil) - if err != nil && err != sql.ErrNoRows { - handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw) - return - } else if err == nil { - for _, job := range jobs { - if (req.StartTime - job.StartTime) < 86400 { - handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw) - return - } - } - } - - id, err := api.JobRepository.Start(&req) - if err != nil { - 
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw) - return - } - // unlock here, adding Tags can be async - unlockOnce.Do(api.RepositoryMutex.Unlock) - - for _, tag := range req.Tags { - if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw) - return - } - } - - log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime) - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusCreated) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: "success", - }) -} - -// stopJobByRequest godoc -// @summary Marks job as completed and triggers archiving -// @tags Job add and modify -// @description Job to stop is specified by request body. All fields are required in this case. -// @description Returns full job resource information according to 'JobMeta' scheme. -// @produce json -// @param request body api.StopJobApiRequest true "All fields required" -// @success 200 {object} schema.JobMeta "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/stop_job/ [post] -func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { - // Parse request body - req := StopJobApiRequest{} - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - // Fetch job (that will be stopped) from db - var job *schema.Job - var err error - if req.JobId == nil { - handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) - return - } - - // log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) - job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) - if err != nil { - job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime) - // FIXME: Previous error is hidden - if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - } - - api.checkAndHandleStopJob(rw, job, req) -} - -// deleteJobById godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Job to remove is specified by database ID. This will not remove the job from the job archive. 
-// @produce json -// @param id path int true "Database ID of Job" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job/{id} [delete] -func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["id"] - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - err = api.JobRepository.DeleteJobById(id) - } else { - handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted job %s", id), - }) -} - -// deleteJobByRequest godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Job to delete is specified by request body. All fields are required in this case. -// @accept json -// @produce json -// @param request body api.DeleteJobApiRequest true "All fields required" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job/ [delete] -func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) { - // Parse request body - req := DeleteJobApiRequest{} - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - // Fetch job (that will be deleted) from db - var job *schema.Job - var err error - if req.JobId == nil { - handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) - if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - err = api.JobRepository.DeleteJobById(*job.ID) - if err != nil { - handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted job %d", job.ID), - }) -} - -// deleteJobBefore godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Remove all jobs with 
start time before timestamp. The jobs will not be removed from the job archive. -// @produce json -// @param ts path int true "Unix epoch timestamp" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job_before/{ts} [delete] -func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { - var cnt int - // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["ts"] - var err error - if ok { - ts, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw) - return - } - - cnt, err = api.JobRepository.DeleteJobsBefore(ts) - } else { - handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted %d jobs", cnt), - }) -} - -func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) { - // Sanity checks - if job.State != schema.JobStateRunning { - handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) - return - } - - if job == nil || job.StartTime > req.StopTime { - handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw) - return - } - - if req.State != "" && !req.State.Valid() { - handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw) - return - } else if req.State == "" { - req.State = schema.JobStateCompleted - } - - // Mark job as stopped in the database (update state and duration) - job.Duration = int32(req.StopTime - job.StartTime) - job.State = req.State - api.JobRepository.Mutex.Lock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - api.JobRepository.Mutex.Unlock() - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) - return - } - } - api.JobRepository.Mutex.Unlock() - - log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) - - // Send a response (with status OK). This means that erros that happen from here on forward - // can *NOT* be communicated to the client. 
If reading from a MetricDataRepository or - // writing to the filesystem fails, the client will not know. - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) - - // Monitoring is disabled... - if job.MonitoringStatus == schema.MonitoringStatusDisabled { - return - } - - // Trigger async archiving - archiver.TriggerArchiving(job) -} - -func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) { - id := mux.Vars(r)["id"] - metrics := r.URL.Query()["metric"] - var scopes []schema.MetricScope - for _, scope := range r.URL.Query()["scope"] { - var s schema.MetricScope - if err := s.UnmarshalGQL(scope); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - scopes = append(scopes, s) - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - - type Respone struct { - Data *struct { - JobMetrics []*model.JobMetricWithName `json:"jobMetrics"` - } `json:"data"` - Error *struct { - Message string `json:"message"` - } `json:"error"` - } - - resolver := graph.GetResolverInstance() - data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil) - if err != nil { - json.NewEncoder(rw).Encode(Respone{ - Error: &struct { - Message string "json:\"message\"" - }{Message: err.Error()}, - }) - return - } - - json.NewEncoder(rw).Encode(Respone{ - Data: &struct { - JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\"" - }{JobMetrics: data}, - }) -} - -func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - rw.Header().Set("Content-Type", "text/plain") - me := repository.GetUserFromContext(r.Context()) - if !me.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden) - return - } - - username, password, role, name, email, project := r.FormValue("username"), - r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), - r.FormValue("email"), r.FormValue("project") - - if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { - http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest) - return - } - - if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) { - http.Error(rw, "only managers require a project (can be changed later)", - http.StatusBadRequest) - return - } else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) { - http.Error(rw, "managers require a project to manage (can be changed later)", - http.StatusBadRequest) - return - } - - if err := repository.GetUserRepository().AddUser(&schema.User{ - Username: username, - Name: name, - Password: password, - Email: email, - Projects: []string{project}, - Roles: []string{role}, - }); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - - fmt.Fprintf(rw, "User %v successfully created!\n", username) -} - -func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden) - return - } - - username := r.FormValue("username") - if err := repository.GetUserRepository().DelUser(username); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - 
} - - rw.WriteHeader(http.StatusOK) -} - -// getUsers godoc -// @summary Returns a list of users -// @tags User -// @description Returns a JSON-encoded list of users. -// @description Required query-parameter defines if all users or only users with additional special roles are returned. -// @produce json -// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" -// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" -// @failure 400 {string} string "Bad Request" -// @failure 401 {string} string "Unauthorized" -// @failure 403 {string} string "Forbidden" -// @failure 500 {string} string "Internal Server Error" -// @security ApiKeyAuth -// @router /api/users/ [get] -func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden) - return - } - - users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true") - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - json.NewEncoder(rw).Encode(users) -} - -func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden) - return - } - - // Get Values - newrole := r.FormValue("add-role") - delrole := r.FormValue("remove-role") - newproj := r.FormValue("add-project") - delproj := r.FormValue("remove-project") - - // TODO: Handle anything but roles... - if newrole != "" { - if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Add Role Success")) - } else if delrole != "" { - if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Remove Role Success")) - } else if newproj != "" { - if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Add Project Success")) - } else if delproj != "" { - if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Remove Project Success")) - } else { - http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError) - } -} - func (api *RestApi) editNotice(rw http.ResponseWriter, r *http.Request) { // SecuredCheck() only worked with TokenAuth: Removed diff --git a/internal/api/user.go b/internal/api/user.go new file mode 100644 index 0000000..3ba9c87 --- /dev/null +++ b/internal/api/user.go @@ -0,0 +1,159 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package api + +import ( + "encoding/json" + "fmt" + "net/http" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/gorilla/mux" +) + +type ApiReturnedUser struct { + Username string `json:"username"` + Name string `json:"name"` + Roles []string `json:"roles"` + Email string `json:"email"` + Projects []string `json:"projects"` +} + +// getUsers godoc +// @summary Returns a list of users +// @tags User +// @description Returns a JSON-encoded list of users. +// @description Required query-parameter defines if all users or only users with additional special roles are returned. +// @produce json +// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" +// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" +// @failure 400 {string} string "Bad Request" +// @failure 401 {string} string "Unauthorized" +// @failure 403 {string} string "Forbidden" +// @failure 500 {string} string "Internal Server Error" +// @security ApiKeyAuth +// @router /api/users/ [get] +func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden) + return + } + + users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true") + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + json.NewEncoder(rw).Encode(users) +} + +func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden) + return + } + + // Get Values + newrole := r.FormValue("add-role") + delrole := r.FormValue("remove-role") + newproj := r.FormValue("add-project") + delproj := r.FormValue("remove-project") + + // TODO: Handle anything but roles... 
+ if newrole != "" { + if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Add Role Success")) + } else if delrole != "" { + if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Remove Role Success")) + } else if newproj != "" { + if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Add Project Success")) + } else if delproj != "" { + if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Remove Project Success")) + } else { + http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError) + } +} + +func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + rw.Header().Set("Content-Type", "text/plain") + me := repository.GetUserFromContext(r.Context()) + if !me.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden) + return + } + + username, password, role, name, email, project := r.FormValue("username"), + r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), + r.FormValue("email"), r.FormValue("project") + + if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { + http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest) + return + } + + if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) { + http.Error(rw, "only managers require a project (can be changed later)", + http.StatusBadRequest) + return + } else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) { + http.Error(rw, "managers require a project to manage (can be changed later)", + http.StatusBadRequest) + return + } + + if err := repository.GetUserRepository().AddUser(&schema.User{ + Username: username, + Name: name, + Password: password, + Email: email, + Projects: []string{project}, + Roles: []string{role}, + }); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + + fmt.Fprintf(rw, "User %v successfully created!\n", username) +} + +func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden) + return + } + + username := r.FormValue("username") + if err := repository.GetUserRepository().DelUser(username); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + + rw.WriteHeader(http.StatusOK) +} From 7db83d216ebff397684b4e5f92808c83b9181e4b Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 5 Jun 2025 14:27:21 +0200 Subject: [PATCH 42/45] Start implementing nodestate rest api --- internal/api/node.go | 11 ++++++ internal/repository/node.go | 58 ++++++++++++++++++++-------- internal/repository/testdata/job.db | Bin 118784 -> 118784 bytes 3 files changed, 
52 insertions(+), 17 deletions(-) diff --git a/internal/api/node.go b/internal/api/node.go index ab34b16..2f7a319 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -7,6 +7,9 @@ package api import ( "fmt" "net/http" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" ) type Node struct { @@ -27,4 +30,12 @@ func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) { handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) return } + repo := repository.GetNodeRepository() + + for _, node := range req.Nodes { + state := schema.NodeStateUnknown + // TODO: Determine valid node state + repo.UpdateNodeState(node.Name, req.Cluster, &state) + + } } diff --git a/internal/repository/node.go b/internal/repository/node.go index 78323e6..0e742c2 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -5,12 +5,14 @@ package repository import ( + "database/sql" "encoding/json" "fmt" "maps" "sync" "time" + "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/lrucache" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -134,16 +136,11 @@ func (r *NodeRepository) GetNode(id int64, withMeta bool) (*schema.Node, error) } const NamedNodeInsert string = ` -INSERT INTO node (hostname, cluster, subcluster, node_state, health_state, raw_meta_data) - VALUES (:hostname, :cluster, :subcluster, :node_state, :health_state, :raw_meta_data);` +INSERT INTO node (hostname, cluster, subcluster, node_state, health_state) + VALUES (:hostname, :cluster, :subcluster, :node_state, :health_state);` func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) { var err error - node.RawMetaData, err = json.Marshal(node.MetaData) - if err != nil { - log.Errorf("Error while marshaling metadata for node '%v'", node.Hostname) - return 0, err - } res, err := r.DB.NamedExec(NamedNodeInsert, node) if err != nil { @@ -159,8 +156,35 @@ func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) { return node.ID, nil } -func (r *NodeRepository) UpdateNodeState(hostname string, nodeState *schema.NodeState) error { - if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.hostname = ?", hostname).RunWith(r.DB).Exec(); err != nil { +func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeState *schema.NodeState) error { + var id int64 + if err := sq.Select("id").From("node"). + Where("node.hostname = ?", hostname).Where("node.cluster = ?", cluster).RunWith(r.DB). 
+		QueryRow().Scan(&id); err != nil {
+		if err == sql.ErrNoRows {
+			subcluster, err := archive.GetSubClusterByNode(cluster, hostname)
+			if err != nil {
+				log.Errorf("Error while getting subcluster for node '%s' in cluster '%s': %v", hostname, cluster, err)
+				return err
+			}
+			node := schema.Node{
+				Hostname: hostname, Cluster: cluster, SubCluster: subcluster, NodeState: *nodeState,
+				HealthState: schema.MonitoringStateFull,
+			}
+			_, err = r.AddNode(&node)
+			if err != nil {
+				log.Errorf("Error while adding node '%s' to database: %v", hostname, err)
+				return err
+			}
+
+			return nil
+		} else {
+			log.Warnf("Error while querying node '%s' from database: %v", hostname, err)
+			return err
+		}
+	}
+
+	if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil {
 		log.Errorf("error while updating node '%s'", hostname)
 		return err
 	}
@@ -168,14 +192,14 @@ func (r *NodeRepository) UpdateNodeState(hostname string, nodeState *schema.Node
 	return nil
 }
 
-func (r *NodeRepository) UpdateHealthState(hostname string, healthState *schema.MonitoringState) error {
-	if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil {
-		log.Errorf("error while updating node '%d'", id)
-		return err
-	}
-
-	return nil
-}
+// func (r *NodeRepository) UpdateHealthState(hostname string, healthState *schema.MonitoringState) error {
+// 	if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil {
+// 		log.Errorf("error while updating node '%d'", id)
+// 		return err
+// 	}
+//
+// 	return nil
+// }
 
 func (r *NodeRepository) DeleteNode(id int64) error {
 	_, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db
index c65dfd093bbf6f8e9cbedd58c8a4e9c19cc662ed..e9e20cebc366222e58e1ba375f78b4b57a3dd444 100644
GIT binary patch
[base85-encoded binary delta omitted]

From: Jan Eitzinger
Date: Thu, 5 Jun 2025 14:27:26 +0200
Subject: [PATCH 43/45] Refactor

---
 pkg/schema/user.go | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pkg/schema/user.go b/pkg/schema/user.go
index 9b62cfa..2fff453 100644
--- a/pkg/schema/user.go
+++ b/pkg/schema/user.go
@@ -6,6 +6,7 @@ package schema
 
 import (
 	"fmt"
+	"slices"
 	"strings"
 )
 
@@ -50,12 +51,7 @@ type User struct {
 }
 
 func (u *User) HasProject(project string) bool {
-	for _, p := range u.Projects {
-		if p == project {
-			return true
-		}
-	}
-	return false
+	return slices.Contains(u.Projects, project)
 }
 
 func GetRoleString(roleInt Role) string {

From 1d9aa759607eccf723c34c148f17e7cab3abd05e Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Thu, 5 Jun 2025 16:15:40 +0200
Subject: [PATCH 44/45] Add determine nodestate routine

---
 internal/api/node.go | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/internal/api/node.go b/internal/api/node.go
index 2f7a319..62c1e2a 100644
--- a/internal/api/node.go
+++ b/internal/api/node.go
@@ -23,6 +23,40 @@ type UpdateNodeStatesRequest struct {
 	Cluster string `json:"cluster" example:"fritz"`
 }
 
+// this routine assumes that only one of them applies per node
+func determineState(states []string) schema.NodeState {
+	for _, state := range states {
+		switch state {
+		case "allocated":
+			return schema.NodeStateAllocated
+		case "reserved":
+			return schema.NodeStateReserved
+		case "idle":
+			return schema.NodeStateIdle
+		case "down":
+			return schema.NodeStateDown
+		case "mixed":
+			return schema.NodeStateMixed
+		}
+	}
+
+	return schema.NodeStateUnknown
+}
+
+// updateNodeStates godoc
+// @summary Deliver updated Slurm node states
+// @tags node
+// @description Accepts a JSON-encoded list of nodes and their current Slurm states for one cluster.
+// @description Each node's state list is mapped to an internal node state and persisted via the node repository.
+// @produce json
+// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
+// @success 200 {array} api.SuccessResponse "Success"
+// @failure 400 {string} string "Bad Request"
+// @failure 401 {string} string "Unauthorized"
+// @failure 403 {string} string "Forbidden"
+// @failure 500 {string} string "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/nodestats/ [post]
 func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 	// Parse request body
 	req := UpdateNodeStatesRequest{}
@@ -33,9 +67,7 @@ func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 	repo := repository.GetNodeRepository()
 
 	for _, node := range req.Nodes {
-		state := schema.NodeStateUnknown
-		// TODO: Determine valid node state
+		state := determineState(node.States)
 		repo.UpdateNodeState(node.Name, req.Cluster, &state)
-
 	}
 }

From 9cd4b3c1cc79f3cde124f48a61aafcbbd0101349 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Thu, 5 Jun 2025 16:20:48 +0200
Subject: [PATCH 45/45] Convert to all lower case

---
 internal/api/node.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/internal/api/node.go b/internal/api/node.go
index 62c1e2a..7a582ed 100644
--- a/internal/api/node.go
+++ b/internal/api/node.go
@@ -7,6 +7,7 @@ package api
 import (
 	"fmt"
 	"net/http"
+	"strings"
 
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
 	"github.com/ClusterCockpit/cc-backend/pkg/schema"
 )
@@ -26,7 +27,7 @@ type UpdateNodeStatesRequest struct {
 // this routine assumes that only one of them applies per node
 func determineState(states []string) schema.NodeState {
 	for _, state := range states {
-		switch state {
+		switch strings.ToLower(state) {
 		case "allocated":
 			return schema.NodeStateAllocated
 		case "reserved":