From c920c57f5d7d88550f161a95fd51b62bd24ff2a5 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Sat, 7 Feb 2026 10:51:56 +0100 Subject: [PATCH 01/31] Add parquet file job archiving target --- go.mod | 11 +- go.sum | 13 ++ internal/taskmanager/retentionService.go | 94 ++++++++++ internal/taskmanager/taskManager.go | 21 ++- pkg/archive/ConfigSchema.go | 39 +++- pkg/archive/parquet/convert.go | 116 ++++++++++++ pkg/archive/parquet/schema.go | 32 ++++ pkg/archive/parquet/target.go | 100 ++++++++++ pkg/archive/parquet/writer.go | 113 ++++++++++++ pkg/archive/parquet/writer_test.go | 225 +++++++++++++++++++++++ 10 files changed, 755 insertions(+), 9 deletions(-) create mode 100644 pkg/archive/parquet/convert.go create mode 100644 pkg/archive/parquet/schema.go create mode 100644 pkg/archive/parquet/target.go create mode 100644 pkg/archive/parquet/writer.go create mode 100644 pkg/archive/parquet/writer_test.go diff --git a/go.mod b/go.mod index da712da9..af27227a 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,6 @@ module github.com/ClusterCockpit/cc-backend -go 1.24.0 - -toolchain go1.24.1 +go 1.24.9 tool ( github.com/99designs/gqlgen @@ -47,6 +45,7 @@ require ( github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/KyleBanks/depth v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect + github.com/andybalholm/brotli v1.1.1 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect @@ -98,6 +97,10 @@ require ( github.com/nats-io/nkeys v0.4.12 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/oapi-codegen/runtime v1.1.1 // indirect + github.com/parquet-go/bitpack v1.0.0 // indirect + github.com/parquet-go/jsonlite v1.0.0 // indirect + github.com/parquet-go/parquet-go v0.27.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/common v0.67.4 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect @@ -106,6 +109,7 @@ require ( github.com/stmcginnis/gofish v0.20.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/swaggo/files v1.0.1 // indirect + github.com/twpayne/go-geom v1.6.1 // indirect github.com/urfave/cli/v2 v2.27.7 // indirect github.com/urfave/cli/v3 v3.6.1 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect @@ -118,6 +122,7 @@ require ( golang.org/x/sys v0.39.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/tools v0.40.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 43331fce..a9cb9ddb 100644 --- a/go.sum +++ b/go.sum @@ -21,6 +21,8 @@ github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktp github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= +github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= +github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= github.com/andybalholm/cascadia v1.3.3 
h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM= @@ -238,6 +240,14 @@ github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLA github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= +github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= +github.com/parquet-go/jsonlite v1.0.0 h1:87QNdi56wOfsE5bdgas0vRzHPxfJgzrXGml1zZdd7VU= +github.com/parquet-go/jsonlite v1.0.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0= +github.com/parquet-go/parquet-go v0.27.0 h1:vHWK2xaHbj+v1DYps03yDRpEsdtOeKbhiXUaixoPb3g= +github.com/parquet-go/parquet-go v0.27.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= @@ -285,6 +295,8 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64 github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ= github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI= github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg= +github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4= +github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028= github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= @@ -293,6 +305,7 @@ github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= diff --git a/internal/taskmanager/retentionService.go b/internal/taskmanager/retentionService.go index 5678cd14..453d10bc 100644 --- a/internal/taskmanager/retentionService.go +++ b/internal/taskmanager/retentionService.go @@ -9,6 +9,7 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/pkg/archive" + pqarchive 
"github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) @@ -66,3 +67,96 @@ func RegisterRetentionMoveService(age int, includeDB bool, location string, omit } })) } + +func RegisterRetentionParquetService(retention Retention) { + cclog.Info("Register retention parquet service") + + maxFileSizeMB := retention.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 512 + } + + var target pqarchive.ParquetTarget + var err error + + switch retention.TargetKind { + case "s3": + target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: retention.TargetEndpoint, + Bucket: retention.TargetBucket, + AccessKey: retention.TargetAccessKey, + SecretKey: retention.TargetSecretKey, + Region: retention.TargetRegion, + UsePathStyle: retention.TargetUsePathStyle, + }) + default: + target, err = pqarchive.NewFileTarget(retention.TargetPath) + } + + if err != nil { + cclog.Errorf("Parquet retention: failed to create target: %v", err) + return + } + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))), + gocron.NewTask( + func() { + startTime := time.Now().Unix() - int64(retention.Age*24*3600) + jobs, err := jobRepo.FindJobsBetween(0, startTime, retention.OmitTagged) + if err != nil { + cclog.Warnf("Parquet retention: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return + } + + cclog.Infof("Parquet retention: processing %d jobs", len(jobs)) + ar := archive.GetHandle() + pw := pqarchive.NewParquetWriter(target, maxFileSizeMB) + + for _, job := range jobs { + meta, err := ar.LoadJobMeta(job) + if err != nil { + cclog.Warnf("Parquet retention: load meta for job %d: %v", job.JobID, err) + continue + } + + data, err := ar.LoadJobData(job) + if err != nil { + cclog.Warnf("Parquet retention: load data for job %d: %v", job.JobID, err) + continue + } + + row, err := pqarchive.JobToParquetRow(meta, &data) + if err != nil { + cclog.Warnf("Parquet retention: convert job %d: %v", job.JobID, err) + continue + } + + if err := pw.AddJob(*row); err != nil { + cclog.Errorf("Parquet retention: add job %d to writer: %v", job.JobID, err) + continue + } + } + + if err := pw.Close(); err != nil { + cclog.Errorf("Parquet retention: close writer: %v", err) + return + } + + ar.CleanUp(jobs) + + if retention.IncludeDB { + cnt, err := jobRepo.DeleteJobsBefore(startTime, retention.OmitTagged) + if err != nil { + cclog.Errorf("Parquet retention: delete jobs from db: %v", err) + } else { + cclog.Infof("Parquet retention: removed %d jobs from db", cnt) + } + if err = jobRepo.Optimize(); err != nil { + cclog.Errorf("Parquet retention: db optimization error: %v", err) + } + } + })) +} diff --git a/internal/taskmanager/taskManager.go b/internal/taskmanager/taskManager.go index cbc4120f..e323557b 100644 --- a/internal/taskmanager/taskManager.go +++ b/internal/taskmanager/taskManager.go @@ -23,11 +23,20 @@ const ( // Retention defines the configuration for job retention policies. 
type Retention struct { - Policy string `json:"policy"` - Location string `json:"location"` - Age int `json:"age"` - IncludeDB bool `json:"includeDB"` - OmitTagged bool `json:"omitTagged"` + Policy string `json:"policy"` + Location string `json:"location"` + Age int `json:"age"` + IncludeDB bool `json:"includeDB"` + OmitTagged bool `json:"omitTagged"` + TargetKind string `json:"target-kind"` + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } // CronFrequency defines the execution intervals for various background workers. @@ -87,6 +96,8 @@ func initArchiveServices(config json.RawMessage) { cfg.Retention.IncludeDB, cfg.Retention.Location, cfg.Retention.OmitTagged) + case "parquet": + RegisterRetentionParquetService(cfg.Retention) } if cfg.Compression > 0 { diff --git a/pkg/archive/ConfigSchema.go b/pkg/archive/ConfigSchema.go index aebcf37b..db568200 100644 --- a/pkg/archive/ConfigSchema.go +++ b/pkg/archive/ConfigSchema.go @@ -57,7 +57,7 @@ var configSchema = ` "policy": { "description": "Retention policy", "type": "string", - "enum": ["none", "delete", "move"] + "enum": ["none", "delete", "move", "parquet"] }, "include-db": { "description": "Also remove jobs from database", @@ -70,6 +70,43 @@ var configSchema = ` "location": { "description": "The target directory for retention. Only applicable for retention move.", "type": "string" + }, + "target-kind": { + "description": "Target storage kind for parquet retention: file or s3", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Target directory path for parquet file storage", + "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL for parquet target", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name for parquet target", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key for parquet target", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key for parquet target", + "type": "string" + }, + "target-region": { + "description": "S3 region for parquet target", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 URLs for parquet target", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB before splitting", + "type": "integer" } }, "required": ["policy"] diff --git a/pkg/archive/parquet/convert.go b/pkg/archive/parquet/convert.go new file mode 100644 index 00000000..ceaa3f2f --- /dev/null +++ b/pkg/archive/parquet/convert.go @@ -0,0 +1,116 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "fmt" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// JobToParquetRow converts job metadata and metric data into a flat ParquetJobRow. +// Nested fields are marshaled to JSON; metric data is gzip-compressed JSON. 
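+// Optional fields (statistics, tags, metadata, footprints) remain nil when the job has none.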
+func JobToParquetRow(meta *schema.Job, data *schema.JobData) (*ParquetJobRow, error) { + resourcesJSON, err := json.Marshal(meta.Resources) + if err != nil { + return nil, fmt.Errorf("marshal resources: %w", err) + } + + var statisticsJSON []byte + if meta.Statistics != nil { + statisticsJSON, err = json.Marshal(meta.Statistics) + if err != nil { + return nil, fmt.Errorf("marshal statistics: %w", err) + } + } + + var tagsJSON []byte + if len(meta.Tags) > 0 { + tagsJSON, err = json.Marshal(meta.Tags) + if err != nil { + return nil, fmt.Errorf("marshal tags: %w", err) + } + } + + var metaDataJSON []byte + if meta.MetaData != nil { + metaDataJSON, err = json.Marshal(meta.MetaData) + if err != nil { + return nil, fmt.Errorf("marshal metadata: %w", err) + } + } + + var footprintJSON []byte + if meta.Footprint != nil { + footprintJSON, err = json.Marshal(meta.Footprint) + if err != nil { + return nil, fmt.Errorf("marshal footprint: %w", err) + } + } + + var energyFootJSON []byte + if meta.EnergyFootprint != nil { + energyFootJSON, err = json.Marshal(meta.EnergyFootprint) + if err != nil { + return nil, fmt.Errorf("marshal energy footprint: %w", err) + } + } + + metricDataGz, err := compressJobData(data) + if err != nil { + return nil, fmt.Errorf("compress metric data: %w", err) + } + + return &ParquetJobRow{ + JobID: meta.JobID, + Cluster: meta.Cluster, + SubCluster: meta.SubCluster, + Partition: meta.Partition, + Project: meta.Project, + User: meta.User, + State: string(meta.State), + StartTime: meta.StartTime, + Duration: meta.Duration, + Walltime: meta.Walltime, + NumNodes: meta.NumNodes, + NumHWThreads: meta.NumHWThreads, + NumAcc: meta.NumAcc, + Exclusive: meta.Exclusive, + Energy: meta.Energy, + SMT: meta.SMT, + ResourcesJSON: resourcesJSON, + StatisticsJSON: statisticsJSON, + TagsJSON: tagsJSON, + MetaDataJSON: metaDataJSON, + FootprintJSON: footprintJSON, + EnergyFootJSON: energyFootJSON, + MetricDataGz: metricDataGz, + }, nil +} + +func compressJobData(data *schema.JobData) ([]byte, error) { + jsonBytes, err := json.Marshal(data) + if err != nil { + return nil, err + } + + var buf bytes.Buffer + gz, err := gzip.NewWriterLevel(&buf, gzip.BestCompression) + if err != nil { + return nil, err + } + if _, err := gz.Write(jsonBytes); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} diff --git a/pkg/archive/parquet/schema.go b/pkg/archive/parquet/schema.go new file mode 100644 index 00000000..74f82599 --- /dev/null +++ b/pkg/archive/parquet/schema.go @@ -0,0 +1,32 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package parquet + +type ParquetJobRow struct { + JobID int64 `parquet:"job_id"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"sub_cluster"` + Partition string `parquet:"partition,optional"` + Project string `parquet:"project"` + User string `parquet:"user"` + State string `parquet:"job_state"` + StartTime int64 `parquet:"start_time"` + Duration int32 `parquet:"duration"` + Walltime int64 `parquet:"walltime"` + NumNodes int32 `parquet:"num_nodes"` + NumHWThreads int32 `parquet:"num_hwthreads"` + NumAcc int32 `parquet:"num_acc"` + Exclusive int32 `parquet:"exclusive"` + Energy float64 `parquet:"energy"` + SMT int32 `parquet:"smt"` + ResourcesJSON []byte `parquet:"resources_json"` + StatisticsJSON []byte `parquet:"statistics_json,optional"` + TagsJSON []byte `parquet:"tags_json,optional"` + MetaDataJSON []byte `parquet:"meta_data_json,optional"` + FootprintJSON []byte `parquet:"footprint_json,optional"` + EnergyFootJSON []byte `parquet:"energy_footprint_json,optional"` + MetricDataGz []byte `parquet:"metric_data_gz"` +} diff --git a/pkg/archive/parquet/target.go b/pkg/archive/parquet/target.go new file mode 100644 index 00000000..0e8babc2 --- /dev/null +++ b/pkg/archive/parquet/target.go @@ -0,0 +1,100 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// ParquetTarget abstracts the destination for parquet file writes. +type ParquetTarget interface { + WriteFile(name string, data []byte) error +} + +// FileTarget writes parquet files to a local filesystem directory. +type FileTarget struct { + path string +} + +func NewFileTarget(path string) (*FileTarget, error) { + if err := os.MkdirAll(path, 0o750); err != nil { + return nil, fmt.Errorf("create target directory: %w", err) + } + return &FileTarget{path: path}, nil +} + +func (ft *FileTarget) WriteFile(name string, data []byte) error { + return os.WriteFile(filepath.Join(ft.path, name), data, 0o640) +} + +// S3TargetConfig holds the configuration for an S3 parquet target. +type S3TargetConfig struct { + Endpoint string + Bucket string + AccessKey string + SecretKey string + Region string + UsePathStyle bool +} + +// S3Target writes parquet files to an S3-compatible object store. 
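+// Each WriteFile call uploads a single object, using the file name as the object key.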
+type S3Target struct { + client *s3.Client + bucket string +} + +func NewS3Target(cfg S3TargetConfig) (*S3Target, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("S3 target: empty bucket name") + } + + region := cfg.Region + if region == "" { + region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(), + awsconfig.WithRegion(region), + awsconfig.WithCredentialsProvider( + credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""), + ), + ) + if err != nil { + return nil, fmt.Errorf("S3 target: load AWS config: %w", err) + } + + opts := func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + o.UsePathStyle = cfg.UsePathStyle + } + + client := s3.NewFromConfig(awsCfg, opts) + return &S3Target{client: client, bucket: cfg.Bucket}, nil +} + +func (st *S3Target) WriteFile(name string, data []byte) error { + _, err := st.client.PutObject(context.Background(), &s3.PutObjectInput{ + Bucket: aws.String(st.bucket), + Key: aws.String(name), + Body: bytes.NewReader(data), + ContentType: aws.String("application/vnd.apache.parquet"), + }) + if err != nil { + return fmt.Errorf("S3 target: put object %q: %w", name, err) + } + return nil +} diff --git a/pkg/archive/parquet/writer.go b/pkg/archive/parquet/writer.go new file mode 100644 index 00000000..ab56cace --- /dev/null +++ b/pkg/archive/parquet/writer.go @@ -0,0 +1,113 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + pq "github.com/parquet-go/parquet-go" +) + +// ParquetWriter batches ParquetJobRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. +type ParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetJobRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewParquetWriter creates a new writer that flushes batches to the given target. +// maxSizeMB sets the approximate maximum size per parquet file in megabytes. +func NewParquetWriter(target ParquetTarget, maxSizeMB int) *ParquetWriter { + return &ParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddJob adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed to the target first. +func (pw *ParquetWriter) AddJob(row ParquetJobRow) error { + rowSize := estimateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. 
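+// Files are named cc-archive-<date>-<counter>.parquet; flushing an empty batch is a no-op.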
+func (pw *ParquetWriter) Flush() error { + if len(pw.rows) == 0 { + return nil + } + + pw.fileCounter++ + fileName := fmt.Sprintf("cc-archive-%s-%03d.parquet", pw.datePrefix, pw.fileCounter) + + data, err := writeParquetBytes(pw.rows) + if err != nil { + return fmt.Errorf("write parquet buffer: %w", err) + } + + if err := pw.target.WriteFile(fileName, data); err != nil { + return fmt.Errorf("write parquet file %q: %w", fileName, err) + } + + cclog.Infof("Parquet retention: wrote %s (%d jobs, %d bytes)", fileName, len(pw.rows), len(data)) + pw.rows = pw.rows[:0] + pw.currentSize = 0 + return nil +} + +// Close flushes any remaining rows and finalizes the writer. +func (pw *ParquetWriter) Close() error { + return pw.Flush() +} + +func writeParquetBytes(rows []ParquetJobRow) ([]byte, error) { + var buf bytes.Buffer + + writer := pq.NewGenericWriter[ParquetJobRow](&buf, + pq.Compression(&pq.Snappy), + ) + + if _, err := writer.Write(rows); err != nil { + return nil, err + } + if err := writer.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func estimateRowSize(row *ParquetJobRow) int64 { + // Fixed fields: ~100 bytes for numeric fields + strings estimate + size := int64(200) + size += int64(len(row.Cluster) + len(row.SubCluster) + len(row.Partition) + + len(row.Project) + len(row.User) + len(row.State)) + size += int64(len(row.ResourcesJSON)) + size += int64(len(row.StatisticsJSON)) + size += int64(len(row.TagsJSON)) + size += int64(len(row.MetaDataJSON)) + size += int64(len(row.FootprintJSON)) + size += int64(len(row.EnergyFootJSON)) + size += int64(len(row.MetricDataGz)) + return size +} diff --git a/pkg/archive/parquet/writer_test.go b/pkg/archive/parquet/writer_test.go new file mode 100644 index 00000000..6baaa527 --- /dev/null +++ b/pkg/archive/parquet/writer_test.go @@ -0,0 +1,225 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "io" + "sync" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +// memTarget collects written files in memory for testing. +type memTarget struct { + mu sync.Mutex + files map[string][]byte +} + +func newMemTarget() *memTarget { + return &memTarget{files: make(map[string][]byte)} +} + +func (m *memTarget) WriteFile(name string, data []byte) error { + m.mu.Lock() + defer m.mu.Unlock() + m.files[name] = append([]byte(nil), data...) 
+ return nil +} + +func makeTestJob(jobID int64) (*schema.Job, *schema.JobData) { + meta := &schema.Job{ + JobID: jobID, + Cluster: "testcluster", + SubCluster: "sc0", + Project: "testproject", + User: "testuser", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 3600, + Walltime: 7200, + NumNodes: 2, + NumHWThreads: 16, + Exclusive: 1, + SMT: 1, + Resources: []*schema.Resource{ + {Hostname: "node001"}, + {Hostname: "node002"}, + }, + } + + data := schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Unit: schema.Unit{Base: ""}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "node001", + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + }, + }, + } + + return meta, &data +} + +func TestJobToParquetRowConversion(t *testing.T) { + meta, data := makeTestJob(1001) + meta.Tags = []*schema.Tag{{Type: "test", Name: "tag1"}} + meta.MetaData = map[string]string{"key": "value"} + + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + if row.JobID != 1001 { + t.Errorf("JobID = %d, want 1001", row.JobID) + } + if row.Cluster != "testcluster" { + t.Errorf("Cluster = %q, want %q", row.Cluster, "testcluster") + } + if row.User != "testuser" { + t.Errorf("User = %q, want %q", row.User, "testuser") + } + if row.State != "completed" { + t.Errorf("State = %q, want %q", row.State, "completed") + } + if row.NumNodes != 2 { + t.Errorf("NumNodes = %d, want 2", row.NumNodes) + } + + // Verify resources JSON + var resources []*schema.Resource + if err := json.Unmarshal(row.ResourcesJSON, &resources); err != nil { + t.Fatalf("unmarshal resources: %v", err) + } + if len(resources) != 2 { + t.Errorf("resources len = %d, want 2", len(resources)) + } + + // Verify tags JSON + var tags []*schema.Tag + if err := json.Unmarshal(row.TagsJSON, &tags); err != nil { + t.Fatalf("unmarshal tags: %v", err) + } + if len(tags) != 1 || tags[0].Name != "tag1" { + t.Errorf("tags = %v, want [{test tag1}]", tags) + } + + // Verify metric data is gzip-compressed valid JSON + gz, err := gzip.NewReader(bytes.NewReader(row.MetricDataGz)) + if err != nil { + t.Fatalf("gzip reader: %v", err) + } + decompressed, err := io.ReadAll(gz) + if err != nil { + t.Fatalf("gzip read: %v", err) + } + var jobData schema.JobData + if err := json.Unmarshal(decompressed, &jobData); err != nil { + t.Fatalf("unmarshal metric data: %v", err) + } + if _, ok := jobData["cpu_load"]; !ok { + t.Error("metric data missing cpu_load key") + } +} + +func TestParquetWriterSingleBatch(t *testing.T) { + target := newMemTarget() + pw := NewParquetWriter(target, 512) + + for i := int64(0); i < 5; i++ { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + if len(target.files) != 1 { + t.Fatalf("expected 1 file, got %d", len(target.files)) + } + + // Verify the parquet file is readable + for name, data := range target.files { + file := bytes.NewReader(data) + pf, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Fatalf("open parquet %s: %v", name, err) + } + if pf.NumRows() != 5 { + t.Errorf("parquet rows = %d, want 5", pf.NumRows()) + } + } +} + +func TestParquetWriterBatching(t *testing.T) { + target := newMemTarget() + // Use a very small max size to force multiple files + pw := NewParquetWriter(target, 0) 
// 0 MB means every job triggers a flush + pw.maxSizeBytes = 1 // Force flush after every row + + for i := int64(0); i < 3; i++ { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + // With maxSizeBytes=1, each AddJob should flush the previous batch, + // resulting in multiple files + if len(target.files) < 2 { + t.Errorf("expected multiple files due to batching, got %d", len(target.files)) + } + + // Verify all files are valid parquet + for name, data := range target.files { + file := bytes.NewReader(data) + _, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Errorf("invalid parquet file %s: %v", name, err) + } + } +} + +func TestFileTarget(t *testing.T) { + dir := t.TempDir() + ft, err := NewFileTarget(dir) + if err != nil { + t.Fatalf("NewFileTarget: %v", err) + } + + testData := []byte("test parquet data") + if err := ft.WriteFile("test.parquet", testData); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + // Verify file exists and has correct content + // (using the target itself is sufficient; we just check no error) +} From f6aa40d9276a9da368a825271502b6e267030d1a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Sat, 7 Feb 2026 17:48:12 +0100 Subject: [PATCH 02/31] Migrate from gorilla to chi web framework. add 404 handler --- cmd/cc-backend/server.go | 243 +++++++++++++++++--------------- go.mod | 8 +- go.sum | 17 ++- internal/api/api_test.go | 6 +- internal/api/job.go | 26 ++-- internal/api/rest.go | 112 +++++++-------- internal/api/user.go | 10 +- internal/auth/oidc.go | 4 +- internal/routerConfig/routes.go | 54 +++---- 9 files changed, 253 insertions(+), 227 deletions(-) diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 4035c430..bd704eb4 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -14,7 +14,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "net/http" "os" @@ -36,8 +35,9 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/runtime" - "github.com/gorilla/handlers" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" + "github.com/go-chi/chi/v5/middleware" + "github.com/go-chi/cors" httpSwagger "github.com/swaggo/http-swagger" ) @@ -50,7 +50,7 @@ const ( // Server encapsulates the HTTP server state and dependencies type Server struct { - router *mux.Router + router chi.Router server *http.Server restAPIHandle *api.RestAPI natsAPIHandle *api.NatsAPI @@ -70,7 +70,7 @@ func NewServer(version, commit, buildDate string) (*Server, error) { buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate} s := &Server{ - router: mux.NewRouter(), + router: chi.NewRouter(), } if err := s.init(); err != nil { @@ -117,11 +117,11 @@ func (s *Server) init() error { info["hasOpenIDConnect"] = true } - s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) { + s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") cclog.Debugf("##%v##", info) web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info}) - }).Methods(http.MethodGet) + }) s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) { 
rw.Header().Add("Content-Type", "text/html; charset=utf-8") web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo}) @@ -131,13 +131,6 @@ func (s *Server) init() error { web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo}) }) - secured := s.router.PathPrefix("/").Subrouter() - securedapi := s.router.PathPrefix("/api").Subrouter() - userapi := s.router.PathPrefix("/userapi").Subrouter() - configapi := s.router.PathPrefix("/config").Subrouter() - frontendapi := s.router.PathPrefix("/frontend").Subrouter() - metricstoreapi := s.router.PathPrefix("/api").Subrouter() - if !config.Keys.DisableAuthentication { // Create login failure handler (used by both /login and /jwt-login) loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) { @@ -152,10 +145,10 @@ func (s *Server) init() error { }) } - s.router.Handle("/login", authHandle.Login(loginFailureHandler)).Methods(http.MethodPost) - s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler)) + s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP) + s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP) - s.router.Handle("/logout", authHandle.Logout( + s.router.Post("/logout", authHandle.Logout( http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") rw.WriteHeader(http.StatusOK) @@ -166,86 +159,83 @@ func (s *Server) init() error { Build: buildInfo, Infos: info, }) - }))).Methods(http.MethodPost) - - secured.Use(func(next http.Handler) http.Handler { - return authHandle.Auth( - // On success; - next, - - // On failure: - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.WriteHeader(http.StatusUnauthorized) - web.RenderTemplate(rw, "login.tmpl", &web.Page{ - Title: "Authentication failed - ClusterCockpit", - MsgType: "alert-danger", - Message: err.Error(), - Build: buildInfo, - Infos: info, - Redirect: r.RequestURI, - }) - }) - }) - - securedapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - userapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthUserAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - metricstoreapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthMetricStoreAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - configapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthConfigAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - frontendapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthFrontendAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) + })).ServeHTTP) } if flagDev { s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query")) - s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler( - httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet) + s.router.Get("/swagger/*", httpSwagger.Handler( + httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json"))) } - secured.Handle("/query", graphQLServer) - // Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project. 
- secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { - routerConfig.HandleSearchBar(rw, r, buildInfo) + // Secured routes (require authentication) + s.router.Group(func(secured chi.Router) { + if !config.Keys.DisableAuthentication { + secured.Use(func(next http.Handler) http.Handler { + return authHandle.Auth( + next, + func(rw http.ResponseWriter, r *http.Request, err error) { + rw.WriteHeader(http.StatusUnauthorized) + web.RenderTemplate(rw, "login.tmpl", &web.Page{ + Title: "Authentication failed - ClusterCockpit", + MsgType: "alert-danger", + Message: err.Error(), + Build: buildInfo, + Infos: info, + Redirect: r.RequestURI, + }) + }) + }) + } + + secured.Handle("/query", graphQLServer) + + secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { + routerConfig.HandleSearchBar(rw, r, buildInfo) + }) + + routerConfig.SetupRoutes(secured, buildInfo) }) - // Mount all /monitoring/... and /api/... routes. - routerConfig.SetupRoutes(secured, buildInfo) - s.restAPIHandle.MountAPIRoutes(securedapi) - s.restAPIHandle.MountUserAPIRoutes(userapi) - s.restAPIHandle.MountConfigAPIRoutes(configapi) - s.restAPIHandle.MountFrontendAPIRoutes(frontendapi) + // API routes (JWT token auth) + s.router.Route("/api", func(securedapi chi.Router) { + if !config.Keys.DisableAuthentication { + securedapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountAPIRoutes(securedapi) + }) + + // User API routes + s.router.Route("/userapi", func(userapi chi.Router) { + if !config.Keys.DisableAuthentication { + userapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthUserAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountUserAPIRoutes(userapi) + }) + + // Config API routes + s.router.Route("/config", func(configapi chi.Router) { + if !config.Keys.DisableAuthentication { + configapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthConfigAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountConfigAPIRoutes(configapi) + }) + + // Frontend API routes + s.router.Route("/frontend", func(frontendapi chi.Router) { + if !config.Keys.DisableAuthentication { + frontendapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthFrontendAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountFrontendAPIRoutes(frontendapi) + }) if config.Keys.APISubjects != nil { s.natsAPIHandle = api.NewNatsAPI() @@ -254,27 +244,57 @@ func (s *Server) init() error { } } - s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + // Metric store API routes (mounted under /api but with different auth) + s.router.Route("/api", func(metricstoreapi chi.Router) { + if !config.Keys.DisableAuthentication { + metricstoreapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthMetricStoreAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + }) + + // Custom 404 handler for unmatched routes + s.router.NotFound(func(rw http.ResponseWriter, r *http.Request) { + if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") || + strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") { + rw.Header().Set("Content-Type", "application/json") + rw.WriteHeader(http.StatusNotFound) + json.NewEncoder(rw).Encode(map[string]string{ + "status": "Resource not found", + "error": "the requested endpoint does not exist", + }) + return + } + 
rw.WriteHeader(http.StatusNotFound) + web.RenderTemplate(rw, "message.tmpl", &web.Page{ + Title: "Not Found", + MsgType: "alert-warning", + Message: "The requested page was not found.", + Build: buildInfo, + }) + }) if config.Keys.EmbedStaticFiles { if i, err := os.Stat("./var/img"); err == nil { if i.IsDir() { cclog.Info("Use local directory for static images") - s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) + s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) } } - s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles())) + s.router.Handle("/*", http.StripPrefix("/", web.ServeFiles())) } else { - s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles))) + s.router.Handle("/*", http.FileServer(http.Dir(config.Keys.StaticFiles))) } - s.router.Use(handlers.CompressHandler) - s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true))) - s.router.Use(handlers.CORS( - handlers.AllowCredentials(), - handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}), - handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}), - handlers.AllowedOrigins([]string{"*"}))) + s.router.Use(middleware.Compress(5)) + s.router.Use(middleware.Recoverer) + s.router.Use(cors.Handler(cors.Options{ + AllowCredentials: true, + AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}, + AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"}, + AllowedOrigins: []string{"*"}, + })) return nil } @@ -286,18 +306,17 @@ const ( ) func (s *Server) Start(ctx context.Context) error { - handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) { - if strings.HasPrefix(params.Request.RequestURI, "/api/") { + // Add request logging middleware + s.router.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + start := time.Now() + ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor) + next.ServeHTTP(ww, r) cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - time.Since(params.TimeStamp).Milliseconds()) - } else { - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - time.Since(params.TimeStamp).Milliseconds()) - } + r.Method, r.URL.RequestURI(), + ww.Status(), float32(ww.BytesWritten())/1024, + time.Since(start).Milliseconds()) + }) }) // Use configurable timeouts with defaults @@ -307,7 +326,7 @@ func (s *Server) Start(ctx context.Context) error { s.server = &http.Server{ ReadTimeout: readTimeout, WriteTimeout: writeTimeout, - Handler: handler, + Handler: s.router, Addr: config.Keys.Addr, } diff --git a/go.mod b/go.mod index af27227a..77da0104 100644 --- a/go.mod +++ b/go.mod @@ -17,19 +17,20 @@ require ( github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0 github.com/coreos/go-oidc/v3 v3.17.0 github.com/expr-lang/expr v1.17.7 + github.com/go-chi/chi/v5 v5.2.5 + github.com/go-chi/cors v1.2.2 github.com/go-co-op/gocron/v2 v2.19.0 github.com/go-ldap/ldap/v3 v3.4.12 github.com/golang-jwt/jwt/v5 v5.3.0 github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/gops v0.3.28 - github.com/gorilla/handlers v1.5.2 - github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 
github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 github.com/linkedin/goavro/v2 v2.14.1 github.com/mattn/go-sqlite3 v1.14.33 + github.com/parquet-go/parquet-go v0.27.0 github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/stretchr/testify v1.11.1 @@ -64,7 +65,6 @@ require ( github.com/aws/smithy-go v1.24.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect github.com/go-jose/go-jose/v4 v4.1.3 // indirect @@ -81,7 +81,6 @@ require ( github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/goccy/go-yaml v1.19.0 // indirect github.com/golang/snappy v0.0.4 // indirect - github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect @@ -99,7 +98,6 @@ require ( github.com/oapi-codegen/runtime v1.1.1 // indirect github.com/parquet-go/bitpack v1.0.0 // indirect github.com/parquet-go/jsonlite v1.0.0 // indirect - github.com/parquet-go/parquet-go v0.27.0 // indirect github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/common v0.67.4 // indirect diff --git a/go.sum b/go.sum index a9cb9ddb..40b90751 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= +github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= @@ -17,6 +19,10 @@ github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHf github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= +github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY= +github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= +github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc= +github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= 
@@ -87,8 +93,6 @@ github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7c github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8= github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= -github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= -github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= @@ -97,6 +101,10 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= +github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug= +github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0= +github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE= +github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58= github.com/go-co-op/gocron/v2 v2.19.0 h1:OKf2y6LXPs/BgBI2fl8PxUpNAI1DA9Mg+hSeGOS38OU= github.com/go-co-op/gocron/v2 v2.19.0/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U= github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= @@ -156,8 +164,6 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= -github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA= @@ -170,6 +176,8 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4= github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= @@ -305,6 +313,7 @@ 
github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 8cbf95d7..09fc4c7f 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -30,7 +30,7 @@ import ( ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" _ "github.com/mattn/go-sqlite3" ) @@ -216,9 +216,7 @@ func TestRestApi(t *testing.T) { return testData, nil } - r := mux.NewRouter() - r.PathPrefix("/api").Subrouter() - r.StrictSlash(true) + r := chi.NewRouter() restapi.MountAPIRoutes(r) var TestJobID int64 = 123 diff --git a/internal/api/job.go b/internal/api/job.go index c3d1fbbf..d67dbb93 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -27,7 +27,7 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/archive" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) const ( @@ -243,10 +243,10 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) { // @router /api/jobs/{id} [get] func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -336,10 +336,10 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @router /api/jobs/{id} [post] func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -439,7 +439,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/edit_meta/{id} [post] func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -487,7 +487,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/tag_job/{id} [post] func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := 
strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -551,7 +551,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /jobs/tag_job/{id} [delete] func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -786,9 +786,9 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @router /api/jobs/delete_job/{id} [delete] func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -885,9 +885,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { var cnt int // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["ts"] + id := chi.URLParam(r, "ts") var err error - if ok { + if id != "" { ts, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw) @@ -976,7 +976,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) { - id := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") metrics := r.URL.Query()["metric"] var scopes []schema.MetricScope for _, scope := range r.URL.Query()["scope"] { diff --git a/internal/api/rest.go b/internal/api/rest.go index 00ed1f55..90f64f18 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -25,7 +25,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/util" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) // @title ClusterCockpit REST API @@ -73,91 +73,93 @@ func New() *RestAPI { // MountAPIRoutes registers REST API endpoints for job and cluster management. // These routes use JWT token authentication via the X-Auth-Token header. 
-func (api *RestAPI) MountAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountAPIRoutes(r chi.Router) { // REST API Uses TokenAuth // User List - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) + r.Get("/users/", api.getUsers) // Cluster List - r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet) + r.Get("/clusters/", api.getClusters) // Slurm node state - r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut) + r.Post("/nodestate/", api.updateNodeStates) + r.Put("/nodestate/", api.updateNodeStates) // Job Handler if config.Keys.APISubjects == nil { cclog.Info("Enabling REST start/stop job API") - r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut) + r.Post("/jobs/start_job/", api.startJob) + r.Put("/jobs/start_job/", api.startJob) + r.Post("/jobs/stop_job/", api.stopJobByRequest) + r.Put("/jobs/stop_job/", api.stopJobByRequest) } - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/used_nodes", api.getUsedNodes).Methods(http.MethodGet) - r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete) - r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) - r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) + r.Get("/jobs/", api.getJobs) + r.Get("/jobs/used_nodes", api.getUsedNodes) + r.Post("/jobs/tag_job/{id}", api.tagJob) + r.Patch("/jobs/tag_job/{id}", api.tagJob) + r.Delete("/jobs/tag_job/{id}", api.removeTagJob) + r.Post("/jobs/edit_meta/{id}", api.editMeta) + r.Patch("/jobs/edit_meta/{id}", api.editMeta) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) + r.Delete("/jobs/delete_job/", api.deleteJobByRequest) + r.Delete("/jobs/delete_job/{id}", api.deleteJobByID) + r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) - r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete) + r.Delete("/tags/", api.removeTags) if api.MachineStateDir != "" { - r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet) - r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost) + r.Get("/machine_state/{cluster}/{host}", api.getMachineState) + r.Put("/machine_state/{cluster}/{host}", api.putMachineState) + r.Post("/machine_state/{cluster}/{host}", api.putMachineState) } } // MountUserAPIRoutes registers user-accessible REST API endpoints. // These are limited endpoints for regular users with JWT token authentication. 
-func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountUserAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) + r.Get("/jobs/", api.getJobs) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) } // MountMetricStoreAPIRoutes registers metric storage API endpoints. // These endpoints handle metric data ingestion and health checks with JWT token authentication. -func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) { +func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - // Note: StrictSlash handles trailing slash variations automatically - r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost) + r.Post("/free", freeMetrics) + r.Post("/write", writeMetrics) + r.Get("/debug", debugMetrics) + r.Post("/healthcheck", api.updateNodeStates) // Same endpoints but with trailing slash - r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost) + r.Post("/free/", freeMetrics) + r.Post("/write/", writeMetrics) + r.Get("/debug/", debugMetrics) + r.Post("/healthcheck/", api.updateNodeStates) } // MountConfigAPIRoutes registers configuration and user management endpoints. // These routes use session-based authentication and require admin privileges. -func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet) - r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) - r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete) - r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost) - r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost) + r.Get("/roles/", api.getRoles) + r.Post("/users/", api.createUser) + r.Put("/users/", api.createUser) + r.Get("/users/", api.getUsers) + r.Delete("/users/", api.deleteUser) + r.Post("/user/{id}", api.updateUser) + r.Post("/notice/", api.editNotice) } } // MountFrontendAPIRoutes registers frontend-specific API endpoints. // These routes support JWT generation and user configuration updates with session authentication. 
-func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet) - r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost) + r.Get("/jwt/", api.getJWT) + r.Post("/configuration/", api.updateConfiguration) } } @@ -381,9 +383,8 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) { return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") if err := validatePathComponent(cluster, "cluster name"); err != nil { handleError(err, http.StatusBadRequest, rw) @@ -434,9 +435,8 @@ func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) { return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") if err := validatePathComponent(cluster, "cluster name"); err != nil { handleError(err, http.StatusBadRequest, rw) diff --git a/internal/api/user.go b/internal/api/user.go index 5564fd61..5eba0dfc 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -13,7 +13,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) type APIReturnedUser struct { @@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { // Handle role updates if newrole != "" { - if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { + if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil { handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delrole != "" { - if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { + if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil { handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if newproj != "" { - if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { + if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil { handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delproj != "" { - if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { + if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil { handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw) return } diff 
--git a/internal/auth/oidc.go b/internal/auth/oidc.go index a3ff0c2c..f81b651f 100644 --- a/internal/auth/oidc.go +++ b/internal/auth/oidc.go @@ -18,7 +18,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/coreos/go-oidc/v3/oidc" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" "golang.org/x/oauth2" ) @@ -86,7 +86,7 @@ func NewOIDC(a *Authentication) *OIDC { return oa } -func (oa *OIDC) RegisterEndpoints(r *mux.Router) { +func (oa *OIDC) RegisterEndpoints(r chi.Router) { r.HandleFunc("/oidc-login", oa.OAuth2Login) r.HandleFunc("/oidc-callback", oa.OAuth2Callback) } diff --git a/internal/routerConfig/routes.go b/internal/routerConfig/routes.go index b8f6de95..e3978ddc 100644 --- a/internal/routerConfig/routes.go +++ b/internal/routerConfig/routes.go @@ -20,7 +20,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/util" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) type InfoType map[string]interface{} @@ -96,7 +96,7 @@ func setupConfigRoute(i InfoType, r *http.Request) InfoType { } func setupJobRoute(i InfoType, r *http.Request) InfoType { - i["id"] = mux.Vars(r)["id"] + i["id"] = chi.URLParam(r, "id") if config.Keys.EmissionConstant != 0 { i["emission"] = config.Keys.EmissionConstant } @@ -104,7 +104,7 @@ func setupJobRoute(i InfoType, r *http.Request) InfoType { } func setupUserRoute(i InfoType, r *http.Request) InfoType { - username := mux.Vars(r)["id"] + username := chi.URLParam(r, "id") i["id"] = username i["username"] = username // TODO: If forbidden (== err exists), redirect to error page @@ -116,33 +116,33 @@ func setupUserRoute(i InfoType, r *http.Request) InfoType { } func setupClusterStatusRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "DASHBOARD" return i } func setupClusterDetailRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "DETAILS" return i } func setupDashboardRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "PUBLIC" // Used in Main Template return i } func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "OVERVIEW" from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") @@ -154,11 +154,12 @@ func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { } func setupClusterListRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] - i["sid"] = vars["subcluster"] - i["subCluster"] = vars["subcluster"] + cluster := chi.URLParam(r, "cluster") + subcluster := chi.URLParam(r, "subcluster") + i["id"] = cluster + i["cluster"] = cluster + i["sid"] = subcluster + i["subCluster"] = subcluster i["displayType"] = "LIST" from, to := r.URL.Query().Get("from"), 
r.URL.Query().Get("to") @@ -170,10 +171,11 @@ func setupClusterListRoute(i InfoType, r *http.Request) InfoType { } func setupNodeRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["cluster"] = vars["cluster"] - i["hostname"] = vars["hostname"] - i["id"] = fmt.Sprintf("%s (%s)", vars["cluster"], vars["hostname"]) + cluster := chi.URLParam(r, "cluster") + hostname := chi.URLParam(r, "hostname") + i["cluster"] = cluster + i["hostname"] = hostname + i["id"] = fmt.Sprintf("%s (%s)", cluster, hostname) from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") if from != "" && to != "" { i["from"] = from @@ -183,7 +185,7 @@ func setupNodeRoute(i InfoType, r *http.Request) InfoType { } func setupAnalysisRoute(i InfoType, r *http.Request) InfoType { - i["cluster"] = mux.Vars(r)["cluster"] + i["cluster"] = chi.URLParam(r, "cluster") return i } @@ -395,7 +397,7 @@ func buildFilterPresets(query url.Values) map[string]interface{} { return filterPresets } -func SetupRoutes(router *mux.Router, buildInfo web.Build) { +func SetupRoutes(router chi.Router, buildInfo web.Build) { userCfgRepo := repository.GetUserCfgRepo() for _, route := range routes { route := route From 2b395a94e633d5a50ec2a84d31ed9e93693762ef Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Sat, 7 Feb 2026 18:02:48 +0100 Subject: [PATCH 03/31] Fix setup issue with chi router --- cmd/cc-backend/server.go | 85 +++++++++++++++++++++------------------- internal/api/rest.go | 16 ++++---- 2 files changed, 53 insertions(+), 48 deletions(-) diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index bd704eb4..7e2b3ed3 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -106,6 +106,27 @@ func (s *Server) init() error { authHandle := auth.GetAuthInstance() + // Middleware must be defined before routes in chi + s.router.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + start := time.Now() + ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor) + next.ServeHTTP(ww, r) + cclog.Debugf("%s %s (%d, %.02fkb, %dms)", + r.Method, r.URL.RequestURI(), + ww.Status(), float32(ww.BytesWritten())/1024, + time.Since(start).Milliseconds()) + }) + }) + s.router.Use(middleware.Compress(5)) + s.router.Use(middleware.Recoverer) + s.router.Use(cors.Handler(cors.Options{ + AllowCredentials: true, + AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}, + AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"}, + AllowedOrigins: []string{"*"}, + })) + s.restAPIHandle = api.New() info := map[string]any{} @@ -198,13 +219,26 @@ func (s *Server) init() error { }) // API routes (JWT token auth) - s.router.Route("/api", func(securedapi chi.Router) { - if !config.Keys.DisableAuthentication { - securedapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthAPI(next, onFailureResponse) - }) - } - s.restAPIHandle.MountAPIRoutes(securedapi) + s.router.Route("/api", func(apiRouter chi.Router) { + // Main API routes with API auth + apiRouter.Group(func(securedapi chi.Router) { + if !config.Keys.DisableAuthentication { + securedapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountAPIRoutes(securedapi) + }) + + // Metric store API routes with separate auth + apiRouter.Group(func(metricstoreapi chi.Router) { + if !config.Keys.DisableAuthentication { + metricstoreapi.Use(func(next http.Handler) http.Handler { + return 
authHandle.AuthMetricStoreAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + }) }) // User API routes @@ -217,8 +251,9 @@ func (s *Server) init() error { s.restAPIHandle.MountUserAPIRoutes(userapi) }) - // Config API routes - s.router.Route("/config", func(configapi chi.Router) { + // Config API routes (uses Group with full paths to avoid shadowing + // the /config page route that is registered in the secured group) + s.router.Group(func(configapi chi.Router) { if !config.Keys.DisableAuthentication { configapi.Use(func(next http.Handler) http.Handler { return authHandle.AuthConfigAPI(next, onFailureResponse) @@ -244,16 +279,6 @@ func (s *Server) init() error { } } - // Metric store API routes (mounted under /api but with different auth) - s.router.Route("/api", func(metricstoreapi chi.Router) { - if !config.Keys.DisableAuthentication { - metricstoreapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthMetricStoreAPI(next, onFailureResponse) - }) - } - s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) - }) - // Custom 404 handler for unmatched routes s.router.NotFound(func(rw http.ResponseWriter, r *http.Request) { if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") || @@ -287,15 +312,6 @@ func (s *Server) init() error { s.router.Handle("/*", http.FileServer(http.Dir(config.Keys.StaticFiles))) } - s.router.Use(middleware.Compress(5)) - s.router.Use(middleware.Recoverer) - s.router.Use(cors.Handler(cors.Options{ - AllowCredentials: true, - AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}, - AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"}, - AllowedOrigins: []string{"*"}, - })) - return nil } @@ -306,19 +322,6 @@ const ( ) func (s *Server) Start(ctx context.Context) error { - // Add request logging middleware - s.router.Use(func(next http.Handler) http.Handler { - return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { - start := time.Now() - ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor) - next.ServeHTTP(ww, r) - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - r.Method, r.URL.RequestURI(), - ww.Status(), float32(ww.BytesWritten())/1024, - time.Since(start).Milliseconds()) - }) - }) - // Use configurable timeouts with defaults readTimeout := time.Duration(defaultReadTimeout) * time.Second writeTimeout := time.Duration(defaultWriteTimeout) * time.Second diff --git a/internal/api/rest.go b/internal/api/rest.go index 90f64f18..575b1809 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -140,16 +140,18 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) { // MountConfigAPIRoutes registers configuration and user management endpoints. // These routes use session-based authentication and require admin privileges. +// Routes use full paths (including /config prefix) to avoid conflicting with +// the /config page route when registered via Group instead of Route. 
func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.Get("/roles/", api.getRoles) - r.Post("/users/", api.createUser) - r.Put("/users/", api.createUser) - r.Get("/users/", api.getUsers) - r.Delete("/users/", api.deleteUser) - r.Post("/user/{id}", api.updateUser) - r.Post("/notice/", api.editNotice) + r.Get("/config/roles/", api.getRoles) + r.Post("/config/users/", api.createUser) + r.Put("/config/users/", api.createUser) + r.Get("/config/users/", api.getUsers) + r.Delete("/config/users/", api.deleteUser) + r.Post("/config/user/{id}", api.updateUser) + r.Post("/config/notice/", api.editNotice) } } From 624746f34b25f0366c65408e30efb762b7fa97c1 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Sat, 7 Feb 2026 18:29:27 +0100 Subject: [PATCH 04/31] Fix 404 handler route --- cmd/cc-backend/server.go | 36 +++++++++++++++++++++-------- web/templates/404.tmpl | 49 ++++++++++++++++++++++++++++++++-------- web/web.go | 14 ++++++++++++ 3 files changed, 79 insertions(+), 20 deletions(-) diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 7e2b3ed3..250d4860 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -279,8 +279,8 @@ func (s *Server) init() error { } } - // Custom 404 handler for unmatched routes - s.router.NotFound(func(rw http.ResponseWriter, r *http.Request) { + // 404 handler for pages and API routes + notFoundHandler := func(rw http.ResponseWriter, r *http.Request) { if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") || strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") { rw.Header().Set("Content-Type", "application/json") @@ -291,14 +291,13 @@ func (s *Server) init() error { }) return } + rw.Header().Set("Content-Type", "text/html; charset=utf-8") rw.WriteHeader(http.StatusNotFound) - web.RenderTemplate(rw, "message.tmpl", &web.Page{ - Title: "Not Found", - MsgType: "alert-warning", - Message: "The requested page was not found.", - Build: buildInfo, + web.RenderTemplate(rw, "404.tmpl", &web.Page{ + Title: "Page Not Found", + Build: buildInfo, }) - }) + } if config.Keys.EmbedStaticFiles { if i, err := os.Stat("./var/img"); err == nil { @@ -307,9 +306,26 @@ func (s *Server) init() error { s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) } } - s.router.Handle("/*", http.StripPrefix("/", web.ServeFiles())) + fileServer := http.StripPrefix("/", web.ServeFiles()) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + if web.StaticFileExists(r.URL.Path) { + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } else { - s.router.Handle("/*", http.FileServer(http.Dir(config.Keys.StaticFiles))) + staticDir := http.Dir(config.Keys.StaticFiles) + fileServer := http.FileServer(staticDir) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + f, err := staticDir.Open(r.URL.Path) + if err == nil { + f.Close() + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } return nil diff --git a/web/templates/404.tmpl b/web/templates/404.tmpl index 1bddd58b..b27599d0 100644 --- a/web/templates/404.tmpl +++ b/web/templates/404.tmpl @@ -1,10 +1,39 @@ -{{template "base.tmpl" .}} -{{define "content"}} -
-{{end}}
+    {{.Title}}
+    404
+    Page Not Found
+    The page you are looking for does not exist or has been moved.
+    Back to Home
+ + diff --git a/web/web.go b/web/web.go index f3185abc..d24e8fc7 100644 --- a/web/web.go +++ b/web/web.go @@ -186,6 +186,16 @@ func ServeFiles() http.Handler { return http.FileServer(http.FS(publicFiles)) } +// StaticFileExists checks whether a static file exists in the embedded frontend FS. +func StaticFileExists(path string) bool { + path = strings.TrimPrefix(path, "/") + if path == "" { + return false + } + _, err := fs.Stat(frontendFiles, "frontend/public/"+path) + return err == nil +} + //go:embed templates/* var templateFiles embed.FS @@ -201,6 +211,10 @@ func init() { return nil } + if path == "templates/404.tmpl" { + templates[strings.TrimPrefix(path, "templates/")] = template.Must(template.ParseFS(templateFiles, path)) + return nil + } if path == "templates/login.tmpl" { if util.CheckFileExists("./var/login.tmpl") { cclog.Info("overwrite login.tmpl with local file") From c7b366f35ff862225efd4c978fccd1f6c0d047b0 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 9 Feb 2026 07:46:37 +0100 Subject: [PATCH 05/31] Put notFoundHandler earlier to also catch subrouters --- cmd/cc-backend/server.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 4035c430..35be2e85 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -256,6 +256,10 @@ func (s *Server) init() error { s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + // Set NotFound on the router so chi uses it for all unmatched routes, + // including those under subrouters like /api, /userapi, /frontend, etc. + s.router.NotFound(notFoundHandler) + if config.Keys.EmbedStaticFiles { if i, err := os.Stat("./var/img"); err == nil { if i.IsDir() { From fd9b76c6a74af3b21c783fc52fadf31795e2651d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 9 Feb 2026 09:12:06 +0100 Subject: [PATCH 06/31] Security hardening of ldap and oicd auth implementations --- internal/auth/auth.go | 5 ++ internal/auth/ldap.go | 126 ++++++++++++++++++++-------------------- internal/auth/oidc.go | 119 ++++++++++++++++++++++++++++--------- internal/auth/schema.go | 8 +++ 4 files changed, 166 insertions(+), 92 deletions(-) diff --git a/internal/auth/auth.go b/internal/auth/auth.go index df618a3f..8a2073b5 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) { handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin) } +// handleLdapUser syncs LDAP user with database +func handleLdapUser(ldapUser *schema.User) { + handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin) +} + func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error { session, err := auth.sessionStore.New(r, "session") if err != nil { diff --git a/internal/auth/ldap.go b/internal/auth/ldap.go index 5e12f07b..831568d9 100644 --- a/internal/auth/ldap.go +++ b/internal/auth/ldap.go @@ -6,11 +6,12 @@ package auth import ( - "errors" "fmt" + "net" "net/http" "os" "strings" + "time" "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" @@ -25,16 +26,19 @@ type LdapConfig struct { UserBind string `json:"user-bind"` UserFilter string `json:"user-filter"` UserAttr string `json:"username-attr"` + UidAttr string `json:"uid-attr"` SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration. 
SyncDelOldUsers bool `json:"sync-del-old-users"` - // Should an non-existent user be added to the DB if user exists in ldap directory - SyncUserOnLogin bool `json:"sync-user-on-login"` + // Should a non-existent user be added to the DB if user exists in ldap directory + SyncUserOnLogin bool `json:"sync-user-on-login"` + UpdateUserOnLogin bool `json:"update-user-on-login"` } type LdapAuthenticator struct { syncPassword string UserAttr string + UidAttr string } var _ Authenticator = (*LdapAuthenticator)(nil) @@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error { la.UserAttr = "gecos" } + if Keys.LdapConfig.UidAttr != "" { + la.UidAttr = Keys.LdapConfig.UidAttr + } else { + la.UidAttr = "uid" + } + return nil } @@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin( if user.AuthSource == schema.AuthViaLDAP { return user, true } - } else { - if lc.SyncUserOnLogin { - l, err := la.getLdapConnection(true) - if err != nil { - cclog.Error("LDAP connection error") - return nil, false - } - defer l.Close() - - // Search for the given username - searchRequest := ldap.NewSearchRequest( - lc.UserBase, - ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, - fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username), - []string{"dn", "uid", la.UserAttr}, nil) - - sr, err := l.Search(searchRequest) - if err != nil { - cclog.Warn(err) - return nil, false - } - - if len(sr.Entries) != 1 { - cclog.Warn("LDAP: User does not exist or too many entries returned") - return nil, false - } - - entry := sr.Entries[0] - name := entry.GetAttributeValue(la.UserAttr) - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - - user = &schema.User{ - Username: username, - Name: name, - Roles: roles, - Projects: projects, - AuthType: schema.AuthSession, - AuthSource: schema.AuthViaLDAP, - } - - if err := repository.GetUserRepository().AddUser(user); err != nil { - cclog.Errorf("User '%s' LDAP: Insert into DB failed", username) - return nil, false - } - - return user, true + } else if lc.SyncUserOnLogin { + l, err := la.getLdapConnection(true) + if err != nil { + cclog.Error("LDAP connection error") + return nil, false } + defer l.Close() + + // Search for the given username + searchRequest := ldap.NewSearchRequest( + lc.UserBase, + ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, + fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UidAttr, ldap.EscapeFilter(username)), + []string{"dn", la.UidAttr, la.UserAttr}, nil) + + sr, err := l.Search(searchRequest) + if err != nil { + cclog.Warn(err) + return nil, false + } + + if len(sr.Entries) != 1 { + cclog.Warn("LDAP: User does not exist or too many entries returned") + return nil, false + } + + entry := sr.Entries[0] + user = &schema.User{ + Username: username, + Name: entry.GetAttributeValue(la.UserAttr), + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), + AuthType: schema.AuthSession, + AuthSource: schema.AuthViaLDAP, + } + + handleLdapUser(user) + return user, true } return nil, false @@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login( } defer l.Close() - userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username) + userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username)) if err := l.Bind(userDn, r.FormValue("password")); err != nil { cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v", user.Username, err) @@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error { 
lc.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, lc.UserFilter, - []string{"dn", "uid", la.UserAttr}, nil)) + []string{"dn", la.UidAttr, la.UserAttr}, nil)) if err != nil { cclog.Warn("LDAP search error") return err @@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error { newnames := map[string]string{} for _, entry := range ldapResults.Entries { - username := entry.GetAttributeValue("uid") + username := entry.GetAttributeValue(la.UidAttr) if username == "" { - return errors.New("no attribute 'uid'") + return fmt.Errorf("no attribute '%s'", la.UidAttr) } _, ok := users[username] @@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error { for username, where := range users { if where == InDB && lc.SyncDelOldUsers { - ur.DelUser(username) + if err := ur.DelUser(username); err != nil { + cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err) + return err + } cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username) } else if where == InLdap { name := newnames[username] - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - user := &schema.User{ Username: username, Name: name, - Roles: roles, - Projects: projects, + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), AuthSource: schema.AuthViaLDAP, } @@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error { func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) { lc := Keys.LdapConfig - conn, err := ldap.DialURL(lc.URL) + conn, err := ldap.DialURL(lc.URL, + ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second})) if err != nil { cclog.Warn("LDAP URL dial failed") return nil, err } + conn.SetTimeout(30 * time.Second) if admin { if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil { diff --git a/internal/auth/oidc.go b/internal/auth/oidc.go index f81b651f..ec6c77a7 100644 --- a/internal/auth/oidc.go +++ b/internal/auth/oidc.go @@ -9,6 +9,7 @@ import ( "context" "crypto/rand" "encoding/base64" + "fmt" "io" "net/http" "os" @@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin MaxAge: int(time.Hour.Seconds()), Secure: r.TLS != nil, HttpOnly: true, + SameSite: http.SameSiteLaxMode, } http.SetCookie(w, c) } @@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC { ClientID: clientID, ClientSecret: clientSecret, Endpoint: provider.Endpoint(), - RedirectURL: "oidc-callback", - Scopes: []string{oidc.ScopeOpenID, "profile", "email"}, + Scopes: []string{oidc.ScopeOpenID, "profile"}, } oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a} @@ -122,54 +123,93 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier)) if err != nil { - http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("token exchange failed: %s", err.Error()) + http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError) return } // Get user info from OIDC provider with same timeout userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token)) if err != nil { - http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to get userinfo: %s", err.Error()) + http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError) return } - // // 
Extract the ID Token from OAuth2 token. - // rawIDToken, ok := token.Extra("id_token").(string) - // if !ok { - // http.Error(rw, "Cannot access idToken", http.StatusInternalServerError) - // } - // - // verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) - // // Parse and verify ID Token payload. - // idToken, err := verifier.Verify(context.Background(), rawIDToken) - // if err != nil { - // http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError) - // } + // Verify ID token and nonce to prevent replay attacks + rawIDToken, ok := token.Extra("id_token").(string) + if !ok { + http.Error(rw, "ID token not found in response", http.StatusInternalServerError) + return + } + + nonceCookie, err := r.Cookie("nonce") + if err != nil { + http.Error(rw, "nonce cookie not found", http.StatusBadRequest) + return + } + + verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) + idToken, err := verifier.Verify(ctx, rawIDToken) + if err != nil { + cclog.Errorf("ID token verification failed: %s", err.Error()) + http.Error(rw, "ID token verification failed", http.StatusInternalServerError) + return + } + + if idToken.Nonce != nonceCookie.Value { + http.Error(rw, "Nonce mismatch", http.StatusBadRequest) + return + } projects := make([]string, 0) - // Extract custom claims + // Extract custom claims from userinfo var claims struct { Username string `json:"preferred_username"` Name string `json:"name"` - Profile struct { + // Keycloak realm-level roles + RealmAccess struct { + Roles []string `json:"roles"` + } `json:"realm_access"` + // Keycloak client-level roles + ResourceAccess struct { Client struct { Roles []string `json:"roles"` } `json:"clustercockpit"` } `json:"resource_access"` } if err := userInfo.Claims(&claims); err != nil { - http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to extract claims: %s", err.Error()) + http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError) + return + } + + if claims.Username == "" { + http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest) + return + } + + // Merge roles from both client-level and realm-level access + oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...) 
+ + roleSet := make(map[string]bool) + for _, r := range oidcRoles { + switch r { + case "user": + roleSet[schema.GetRoleString(schema.RoleUser)] = true + case "admin": + roleSet[schema.GetRoleString(schema.RoleAdmin)] = true + case "manager": + roleSet[schema.GetRoleString(schema.RoleManager)] = true + case "support": + roleSet[schema.GetRoleString(schema.RoleSupport)] = true + } } var roles []string - for _, r := range claims.Profile.Client.Roles { - switch r { - case "user": - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - case "admin": - roles = append(roles, schema.GetRoleString(schema.RoleAdmin)) - } + for role := range roleSet { + roles = append(roles, role) } if len(roles) == 0 { @@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { handleOIDCUser(user) } - oa.authentication.SaveSession(rw, r, user) - cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) + if err := oa.authentication.SaveSession(rw, r, user); err != nil { + cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error()) + http.Error(rw, "Failed to create session", http.StatusInternalServerError) + return + } + cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user) http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx)) } @@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) { codeVerifier := oauth2.GenerateVerifier() setCallbackCookie(rw, r, "verifier", codeVerifier) + // Generate nonce for ID token replay protection + nonce, err := randString(16) + if err != nil { + http.Error(rw, "Internal error", http.StatusInternalServerError) + return + } + setCallbackCookie(rw, r, "nonce", nonce) + + // Build redirect URL from the incoming request + scheme := "https" + if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" { + scheme = "http" + } + oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host) + // Redirect user to consent page to ask for permission - url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier)) + url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, + oauth2.S256ChallengeOption(codeVerifier), + oidc.Nonce(nonce)) http.Redirect(rw, r, url, http.StatusFound) } diff --git a/internal/auth/schema.go b/internal/auth/schema.go index 496e899b..b6ee0702 100644 --- a/internal/auth/schema.go +++ b/internal/auth/schema.go @@ -92,9 +92,17 @@ var configSchema = ` "description": "Delete obsolete users in database.", "type": "boolean" }, + "uid-attr": { + "description": "LDAP attribute used as login username. 
Default: uid", + "type": "string" + }, "sync-user-on-login": { "description": "Add non-existent user to DB at login attempt if user exists in Ldap directory", "type": "boolean" + }, + "update-user-on-login": { + "description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.", + "type": "boolean" } }, "required": ["url", "user-base", "search-dn", "user-bind", "user-filter"] From 1980ef5f43c405ee01724dcd1f631fb482c1e757 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 9 Feb 2026 09:17:01 +0100 Subject: [PATCH 07/31] Renaming due to linter complaints --- internal/auth/ldap.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/auth/ldap.go b/internal/auth/ldap.go index 831568d9..a174bb9d 100644 --- a/internal/auth/ldap.go +++ b/internal/auth/ldap.go @@ -26,7 +26,7 @@ type LdapConfig struct { UserBind string `json:"user-bind"` UserFilter string `json:"user-filter"` UserAttr string `json:"username-attr"` - UidAttr string `json:"uid-attr"` + UIDAttr string `json:"uid-attr"` SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration. SyncDelOldUsers bool `json:"sync-del-old-users"` @@ -38,7 +38,7 @@ type LdapConfig struct { type LdapAuthenticator struct { syncPassword string UserAttr string - UidAttr string + UIDAttr string } var _ Authenticator = (*LdapAuthenticator)(nil) @@ -55,10 +55,10 @@ func (la *LdapAuthenticator) Init() error { la.UserAttr = "gecos" } - if Keys.LdapConfig.UidAttr != "" { - la.UidAttr = Keys.LdapConfig.UidAttr + if Keys.LdapConfig.UIDAttr != "" { + la.UIDAttr = Keys.LdapConfig.UIDAttr } else { - la.UidAttr = "uid" + la.UIDAttr = "uid" } return nil @@ -88,8 +88,8 @@ func (la *LdapAuthenticator) CanLogin( searchRequest := ldap.NewSearchRequest( lc.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, - fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UidAttr, ldap.EscapeFilter(username)), - []string{"dn", la.UidAttr, la.UserAttr}, nil) + fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)), + []string{"dn", la.UIDAttr, la.UserAttr}, nil) sr, err := l.Search(searchRequest) if err != nil { @@ -169,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error { lc.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, lc.UserFilter, - []string{"dn", la.UidAttr, la.UserAttr}, nil)) + []string{"dn", la.UIDAttr, la.UserAttr}, nil)) if err != nil { cclog.Warn("LDAP search error") return err @@ -177,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error { newnames := map[string]string{} for _, entry := range ldapResults.Entries { - username := entry.GetAttributeValue(la.UidAttr) + username := entry.GetAttributeValue(la.UIDAttr) if username == "" { - return fmt.Errorf("no attribute '%s'", la.UidAttr) + return fmt.Errorf("no attribute '%s'", la.UIDAttr) } _, ok := users[username] From 7dd3ee3084114739b0a1bf8fadd174253593ea7b Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 9 Feb 2026 12:23:21 +0100 Subject: [PATCH 08/31] remove undocumented minRunningFor filter, add short jobs quick selection instead --- web/frontend/src/Jobs.root.svelte | 2 + web/frontend/src/Node.root.svelte | 2 +- web/frontend/src/User.root.svelte | 3 +- web/frontend/src/generic/Filters.svelte | 44 +++++++++++++++++++ web/frontend/src/generic/JobCompare.svelte | 6 +-- web/frontend/src/generic/JobList.svelte | 4 -- .../src/generic/filters/StartTime.svelte | 8 ++-- 7 files changed, 54 insertions(+), 15 deletions(-) diff --git 
a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index 52efca6b..0d543fc8 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -142,6 +142,8 @@ { diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index 6962aff8..2e61ea14 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -119,7 +119,7 @@ const filter = $derived([ { cluster: { eq: cluster } }, - { node: { contains: hostname } }, + { node: { eq: hostname } }, { state: ["running"] }, ]); diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 76c9c97a..4ee3f892 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -219,9 +219,10 @@ { jobFilters = [...detail.filters, { user: { eq: user.username } }]; selectedCluster = jobFilters[0]?.cluster diff --git a/web/frontend/src/generic/Filters.svelte b/web/frontend/src/generic/Filters.svelte index 74f55ca7..5a8bcf23 100644 --- a/web/frontend/src/generic/Filters.svelte +++ b/web/frontend/src/generic/Filters.svelte @@ -6,6 +6,8 @@ - `filterPresets Object?`: Optional predefined filter values [Default: {}] - `disableClusterSelection Bool?`: Is the selection disabled [Default: false] - `startTimeQuickSelect Bool?`: Render startTime quick selections [Default: false] + - `shortJobQuickSelect Bool?`: Render short job quick selections [Default: false] + - `shortJobCutoff Int?`: Time in seconds for jobs to be considered short [Default: null] - `matchedJobs Number?`: Number of jobs matching the filter [Default: -2] - `showFilter Func`: If the filter component should be rendered in addition to total count info [Default: true] - `applyFilters Func`: The callback function to apply current filter selection @@ -25,6 +27,7 @@ ButtonGroup, ButtonDropdown, Icon, + Tooltip } from "@sveltestrap/sveltestrap"; import Info from "./filters/InfoBox.svelte"; import Cluster from "./filters/Cluster.svelte"; @@ -36,6 +39,7 @@ import Resources from "./filters/Resources.svelte"; import Energy from "./filters/Energy.svelte"; import Statistics from "./filters/Stats.svelte"; + import { formatDurationTime } from "./units.js"; /* Svelte 5 Props */ let { @@ -43,6 +47,8 @@ filterPresets = {}, disableClusterSelection = false, startTimeQuickSelect = false, + shortJobQuickSelect = false, + shortJobCutoff = 0, matchedJobs = -2, showFilter = true, applyFilters @@ -335,6 +341,44 @@ (isStatsOpen = true)}> (isStatsOpen = true)} /> Statistics + {#if shortJobQuickSelect && shortJobCutoff > 0} + + + Short Jobs Selection + + + Job duration less than {formatDurationTime(shortJobCutoff)} + + + { + filters.duration = { + moreThan: null, + lessThan: shortJobCutoff, + from: null, + to: null + } + updateFilters(); + }} + > + + Only Short Jobs + + { + filters.duration = { + moreThan: shortJobCutoff, + lessThan: null, + from: null, + to: null + } + updateFilters(); + }} + > + + Exclude Short Jobs + + {/if} {#if startTimeQuickSelect} Start Time Quick Selection diff --git a/web/frontend/src/generic/JobCompare.svelte b/web/frontend/src/generic/JobCompare.svelte index d5283a9a..dfe548b0 100644 --- a/web/frontend/src/generic/JobCompare.svelte +++ b/web/frontend/src/generic/JobCompare.svelte @@ -112,11 +112,7 @@ // (Re-)query and optionally set new filters; Query will be started reactively. 
export function queryJobs(filters) { if (filters != null) { - let minRunningFor = ccconfig.jobList_hideShortRunningJobs; - if (minRunningFor && minRunningFor > 0) { - filters.push({ minRunningFor }); - } - filter = filters; + filter = [...filters]; } } diff --git a/web/frontend/src/generic/JobList.svelte b/web/frontend/src/generic/JobList.svelte index 9394ed5f..278f189e 100644 --- a/web/frontend/src/generic/JobList.svelte +++ b/web/frontend/src/generic/JobList.svelte @@ -180,10 +180,6 @@ // (Re-)query and optionally set new filters; Query will be started reactively. export function queryJobs(filters) { if (filters != null) { - let minRunningFor = ccconfig.jobList_hideShortRunningJobs; - if (minRunningFor && minRunningFor > 0) { - filters.push({ minRunningFor }); - } filter = [...filters]; } }; diff --git a/web/frontend/src/generic/filters/StartTime.svelte b/web/frontend/src/generic/filters/StartTime.svelte index 5d9340e3..2eceaf6e 100644 --- a/web/frontend/src/generic/filters/StartTime.svelte +++ b/web/frontend/src/generic/filters/StartTime.svelte @@ -14,10 +14,10 @@ From d1e7ea09bcf04a1c862ea9cf6f5210ac56146da5 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 9 Feb 2026 15:33:59 +0100 Subject: [PATCH 09/31] review handling of disabled metrics in frontend --- web/frontend/src/Job.root.svelte | 23 +-- web/frontend/src/Node.root.svelte | 15 +- .../src/generic/joblist/JobListRow.svelte | 109 ++++++------ web/frontend/src/systems/NodeOverview.svelte | 76 ++++---- .../src/systems/nodelist/NodeListRow.svelte | 162 ++++++++---------- 5 files changed, 189 insertions(+), 196 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 50de27b5..99dfa7ac 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -333,7 +333,18 @@ {:else if thisJob && $jobMetrics?.data?.scopedJobStats} {#snippet gridContent(item)} - {#if item.data} + {#if item?.disabled} + + + Disabled Metric + + +

+ No dataset(s) returned for {item.metric}
+ Metric has been disabled for subcluster {thisJob.subCluster}.
+ To remove this card, open metric selection, de-select the metric, and press "Close and Apply".
+ {:else if item?.data} x.scope)} isShared={thisJob.shared != "none"} /> - {:else if item.disabled == true} - - - Disabled Metric - - -

- Metric {item.metric} is disabled for cluster {thisJob.cluster}:{thisJob.subCluster}.
- To remove this card, open metric selection and press "Close and Apply".
{:else} diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index 2e61ea14..d3364b49 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -253,12 +253,15 @@ forNode /> {:else if item.disabled === true && item.metric} - Metric disabled for subcluster {item.name}:{$nodeMetricsData.data.nodeMetrics[0] - .subCluster} + + + Disabled Metric + + +

+ No dataset(s) returned for {item.name}
+ Metric has been disabled for subcluster {$nodeMetricsData.data.nodeMetrics[0].subCluster}.
{:else} diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte index 353e0827..5d129ad0 100644 --- a/web/frontend/src/generic/joblist/JobListRow.svelte +++ b/web/frontend/src/generic/joblist/JobListRow.svelte @@ -99,7 +99,7 @@ }) ); - const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []); + const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope(metrics, $metricsQuery.data.jobMetrics) : []); /* Effects */ $effect(() => { @@ -140,6 +140,26 @@ }); } + function sortAndSelectScope(metricList = [], jobMetrics = []) { + const pendingData = []; + metricList.forEach((metricName) => { + const pendingMetric = { + name: metricName, + disabled: checkMetricDisabled( + globalMetrics, + metricName, + job.cluster, + job.subCluster, + ), + data: null + }; + const scopesData = jobMetrics.filter((jobMetric) => jobMetric.name == metricName) + if (scopesData.length > 0) pendingMetric.data = selectScope(scopesData) + pendingData.push(pendingMetric) + }); + return pendingData; + }; + const selectScope = (jobMetrics) => jobMetrics.reduce( (a, b) => @@ -152,30 +172,6 @@ : a, jobMetrics[0], ); - - const sortAndSelectScope = (jobMetrics) => - metrics - .map((name) => jobMetrics.filter((jobMetric) => jobMetric.name == name)) - .map((jobMetrics) => ({ - disabled: false, - data: jobMetrics.length > 0 ? selectScope(jobMetrics) : null, - })) - .map((jobMetric) => { - if (jobMetric.data) { - return { - name: jobMetric.data.name, - disabled: checkMetricDisabled( - globalMetrics, - jobMetric.data.name, - job.cluster, - job.subCluster, - ), - data: jobMetric.data, - }; - } else { - return jobMetric; - } - }); @@ -211,39 +207,36 @@ {/if} {#each refinedData as metric, i (metric?.name || i)} - {#key metric} - {#if metric?.data} - {#if metric?.disabled} - - Metric {metric.data.name}: Disabled for subcluster {job.subCluster} - - {:else} - handleZoom(detail, metric.data.name)} - height={plotHeight} - timestep={metric.data.metric.timestep} - scope={metric.data.scope} - series={metric.data.metric.series} - statisticsSeries={metric.data.metric.statisticsSeries} - metric={metric.data.name} - cluster={clusterInfos.find((c) => c.name == job.cluster)} - subCluster={job.subCluster} - isShared={job.shared != "none"} - numhwthreads={job.numHWThreads} - numaccs={job.numAcc} - zoomState={zoomStates[metric.data.name] || null} - thresholdState={thresholdStates[metric.data.name] || null} - /> - {/if} - {:else} - -

- No dataset(s) returned for {metrics[i]}
- Metric or host was not found in metric store for cluster {job.cluster}:
- Identical messages in {metrics[i]} column: Metric not found.
- Identical messages in job {job.jobId} row: Host not found.
- {/if} - {/key} + {#if metric?.disabled} + +

+ No dataset(s) returned for {metrics[i]}
+ Metric has been disabled for subcluster {job.subCluster}.
+ {:else if metric?.data} + handleZoom(detail, metric.data.name)} + height={plotHeight} + timestep={metric.data.metric.timestep} + scope={metric.data.scope} + series={metric.data.metric.series} + statisticsSeries={metric.data.metric.statisticsSeries} + metric={metric.data.name} + cluster={clusterInfos.find((c) => c.name == job.cluster)} + subCluster={job.subCluster} + isShared={job.shared != "none"} + numhwthreads={job.numHWThreads} + numaccs={job.numAcc} + zoomState={zoomStates[metric.data.name] || null} + thresholdState={thresholdStates[metric.data.name] || null} + /> + {:else} + +

+ No dataset(s) returned for {metrics[i]}
+ Metric or host was not found in metric store for cluster {job.cluster}:
+ Identical messages in {metrics[i]} column: Metric not found.
+ Identical messages in job {job.jobId} row: Host not found.
+ {/if} {:else} diff --git a/web/frontend/src/systems/NodeOverview.svelte b/web/frontend/src/systems/NodeOverview.svelte index fd463600..a34fdf0f 100644 --- a/web/frontend/src/systems/NodeOverview.svelte +++ b/web/frontend/src/systems/NodeOverview.svelte @@ -110,7 +110,7 @@ }; }); }; - + let pendingMapped = []; if (rawData.length > 0) { pendingMapped = rawData.map((h) => ({ @@ -120,12 +120,11 @@ data: h.metrics.filter( (m) => m?.name == selectedMetric && m.scope == "node", ), - // TODO: Move To New Func Variant With Disabled Check on WHole Cluster Level: This never Triggers! disabled: checkMetricDisabled(globalMetrics, selectedMetric, cluster, h.subCluster), })) .sort((a, b) => a.host.localeCompare(b.host)) } - + return pendingMapped; } @@ -162,35 +161,32 @@ - {#if item?.data} - {#if item.disabled === true} - - Metric disabled for subcluster {selectedMetric}:{item.subCluster} - {:else if item.disabled === false} - - - {#key item.data[0].metric.series[0].data.length} - - {/key} - {:else} - - Global Metric List Not Initialized - Can not determine {selectedMetric} availability: Please Reload Page - - {/if} + {#if item?.disabled} + + + Disabled Metric + + +

+ No dataset(s) returned for {selectedMetric}
+ Metric has been disabled for subcluster {item.subCluster}.
+ {:else if item?.data} + + + {#key item.data[0].metric.series[0].data.length} + + {/key} {:else} + Missing Metric @@ -205,10 +201,22 @@ {/each} {/key} +{:else if hostnameFilter || hoststateFilter != 'all'} + + + + Empty Filter Return + + +

+ No datasets returned for {selectedMetric}.
+ Hostname filter and/or host state filter returned no matches.
{:else} - - - + + + Missing Metric diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 2abe0b41..e091769b 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -72,10 +72,30 @@ ); const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null); - const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : []); + const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(selectedMetrics, nodeData.metrics) : []); const dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0))); /* Functions */ + function sortAndSelectScope(metricList = [], nodeMetrics = []) { + const pendingData = []; + metricList.forEach((metricName) => { + const pendingMetric = { + name: metricName, + disabled: checkMetricDisabled( + globalMetrics, + metricName, + cluster, + nodeData.subCluster, + ), + data: null + }; + const scopesData = nodeMetrics.filter((nodeMetric) => nodeMetric.name == metricName) + if (scopesData.length > 0) pendingMetric.data = selectScope(scopesData) + pendingData.push(pendingMetric) + }); + return pendingData; + }; + const selectScope = (nodeMetrics) => nodeMetrics.reduce( (a, b) => @@ -83,29 +103,6 @@ nodeMetrics[0], ); - const sortAndSelectScope = (allNodeMetrics) => - selectedMetrics - .map((selectedName) => allNodeMetrics.filter((nodeMetric) => nodeMetric.name == selectedName)) - .map((matchedNodeMetrics) => ({ - disabled: false, - data: matchedNodeMetrics.length > 0 ? selectScope(matchedNodeMetrics) : null, - })) - .map((scopedNodeMetric) => { - if (scopedNodeMetric?.data) { - return { - disabled: checkMetricDisabled( - globalMetrics, - scopedNodeMetric.data.name, - cluster, - nodeData.subCluster, - ), - data: scopedNodeMetric.data, - }; - } else { - return scopedNodeMetric; - } - }); - function buildExtendedLegend() { let pendingExtendedLegendData = null // Build Extended for allocated nodes [Commented: Only Build extended Legend For Shared Nodes] @@ -171,68 +168,59 @@ {/if} {#each refinedData as metricData, i (metricData?.data?.name || i)} - {#key metricData} - - {#if metricData?.disabled} - Metric {selectedMetrics[i]} disabled for subcluster {nodeData.subCluster} - {:else if !metricData?.data} - -

- No dataset(s) returned for {selectedMetrics[i]}
- Metric was not found in metric store for cluster {cluster}.
- {:else if !metricData?.data?.name} - Metric without name for subcluster {`Metric Index ${i}`}:{nodeData.subCluster} - {:else if !!metricData.data?.metric.statisticsSeries} - - -
- {#key extendedLegendData} - - {/key} - {:else} - - {/if} - - {/key} + + {#if metricData?.disabled} + +

+ No dataset(s) returned for {selectedMetrics[i]}
+ Metric has been disabled for subcluster {nodeData.subCluster}.
+ {:else if !metricData?.data} + +

+ No dataset(s) returned for {selectedMetrics[i]}
+ Metric was not found in metric store for cluster {cluster}.
+ {:else if !!metricData.data?.metric.statisticsSeries} + + +
+ {#key extendedLegendData} + + {/key} + {:else} + + {/if} + {/each} From 51e9d33f9f6abfc73344c07aa9708df187ee8f36 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 9 Feb 2026 17:21:49 +0100 Subject: [PATCH 10/31] fix empty availability print case --- .../src/generic/select/MetricSelection.svelte | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/web/frontend/src/generic/select/MetricSelection.svelte b/web/frontend/src/generic/select/MetricSelection.svelte index dcefa56d..8234b32c 100644 --- a/web/frontend/src/generic/select/MetricSelection.svelte +++ b/web/frontend/src/generic/select/MetricSelection.svelte @@ -88,16 +88,19 @@ function printAvailability(metric, cluster) { const avail = globalMetrics.find((gm) => gm.name === metric)?.availability - if (!cluster) { - return avail.map((av) => av.cluster).join(', ') - } else { - const subAvail = avail.find((av) => av.cluster === cluster)?.subClusters - if (subAvail) { - return subAvail.join(', ') + if (avail) { + if (!cluster) { + return avail.map((av) => av.cluster).join(', ') } else { - return `Not available for ${cluster}` + const subAvail = avail.find((av) => av.cluster === cluster)?.subClusters + if (subAvail) { + return subAvail.join(', ') + } else { + return `Not available for ${cluster}` + } } } + return "" } function columnsDragOver(event) { From ac7eb93141d081ca083e8e576e8bc61268f2671e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 9 Feb 2026 19:57:46 +0100 Subject: [PATCH 11/31] fix: Transfer always to main job table before archiving --- internal/api/job.go | 22 +++++++--- internal/api/nats.go | 22 +++++++--- internal/repository/jobCreate.go | 43 +++++++++++-------- internal/repository/jobCreate_test.go | 62 ++++++++++++++------------- 4 files changed, 91 insertions(+), 58 deletions(-) diff --git a/internal/api/job.go b/internal/api/job.go index d67dbb93..9bd93b1c 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -754,6 +754,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } + isCached := false job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { // Try cached jobs if not found in main repository @@ -764,9 +765,10 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } job = cachedJob + isCached = true } - api.checkAndHandleStopJob(rw, job, req) + api.checkAndHandleStopJob(rw, job, req, isCached) } // deleteJobByID godoc @@ -923,7 +925,7 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { } } -func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) { +func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) { // Sanity checks if job.State != schema.JobStateRunning { handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) @@ -948,11 +950,21 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo api.JobRepository.Mutex.Lock() defer api.JobRepository.Mutex.Unlock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB 
failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + // If the job is still in job_cache, transfer it to the job table first + // so that job.ID always points to the job table for downstream code + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw) return } + cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID + } + + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + return } cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) diff --git a/internal/api/nats.go b/internal/api/nats.go index c0a8c174..0e929426 100644 --- a/internal/api/nats.go +++ b/internal/api/nats.go @@ -251,6 +251,7 @@ func (api *NatsAPI) handleStopJob(payload string) { return } + isCached := false job, err := api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) @@ -260,6 +261,7 @@ func (api *NatsAPI) handleStopJob(payload string) { return } job = cachedJob + isCached = true } if job.State != schema.JobStateRunning { @@ -287,16 +289,26 @@ func (api *NatsAPI) handleStopJob(payload string) { api.JobRepository.Mutex.Lock() defer api.JobRepository.Mutex.Unlock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v", - job.JobID, job.ID, job.Cluster, job.State, err) + // If the job is still in job_cache, transfer it to the job table first + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v", + job.JobID, *job.ID, job.Cluster, err) return } + cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID + } + + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v", + job.JobID, *job.ID, job.Cluster, job.State, err) + return } cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", - job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) + *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) if job.MonitoringStatus == schema.MonitoringStatusDisabled { return diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 6114ae5e..9f4f366d 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -71,8 +71,9 @@ func (r 
*JobRepository) SyncJobs() ([]*schema.Job, error) { jobs = append(jobs, job) } + // Use INSERT OR IGNORE to skip jobs already transferred by the stop path _, err = r.DB.Exec( - "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + "INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { cclog.Warnf("Error while Job sync: %v", err) return nil, err @@ -87,6 +88,29 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { return jobs, nil } +// TransferCachedJobToMain moves a job from job_cache to the job table. +// Caller must hold r.Mutex. Returns the new job table ID. +func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) { + res, err := r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?", + cacheID) + if err != nil { + return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err) + } + + newID, err := res.LastInsertId() + if err != nil { + return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err) + } + + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID) + if err != nil { + return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err) + } + + return newID, nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { @@ -129,20 +153,3 @@ func (r *JobRepository) Stop( return err } -func (r *JobRepository) StopCached( - jobID int64, - duration int32, - state schema.JobState, - monitoringStatus int32, -) (err error) { - // Note: StopCached updates job_cache table, not the main job table - // Cache invalidation happens when job is synced to main table - stmt := sq.Update("job_cache"). - Set("job_state", state). - Set("duration", duration). - Set("monitoring_status", monitoringStatus). 
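Taken together, the job.go, nats.go and jobCreate.go hunks in this patch change the stop path: a job that still lives only in job_cache is first moved into the main job table and only then marked as stopped, so job.ID always refers to the main table when archiving starts. A minimal sketch of that sequence, distilled from the patch for readers who skip the per-file hunks — the wrapper function stopCachedJob, its signature, and the import paths are illustrative assumptions, not code from the repository:

```go
// Sketch only: assumes it lives inside cc-backend, where internal packages are importable.
package example

import (
	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-lib/v2/schema" // assumed import path for schema.Job et al.
)

// stopCachedJob is a hypothetical helper mirroring the flow the REST and NATS
// stop handlers use in this patch: transfer first, then stop.
func stopCachedJob(repo *repository.JobRepository, job *schema.Job, isCached bool) error {
	repo.Mutex.Lock()
	defer repo.Mutex.Unlock()

	if isCached {
		// Move the row from job_cache into job; afterwards *job.ID points at
		// the main table, which archiving and all downstream code expect.
		newID, err := repo.TransferCachedJobToMain(*job.ID)
		if err != nil {
			return err
		}
		job.ID = &newID
	}

	// With the cache row transferred, a single Stop on the job table suffices;
	// the separate StopCached path removed below is no longer needed.
	return repo.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus)
}
```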
- Where("job_cache.id = ?", jobID) - - _, err = stmt.RunWith(r.stmtCache).Exec() - return err -} diff --git a/internal/repository/jobCreate_test.go b/internal/repository/jobCreate_test.go index 3a586482..9e72555f 100644 --- a/internal/repository/jobCreate_test.go +++ b/internal/repository/jobCreate_test.go @@ -331,58 +331,60 @@ func TestStop(t *testing.T) { }) } -func TestStopCached(t *testing.T) { +func TestTransferCachedJobToMain(t *testing.T) { r := setup(t) - t.Run("successful stop cached job", func(t *testing.T) { + t.Run("successful transfer from cache to main", func(t *testing.T) { // Insert a job in job_cache job := createTestJob(999009, "testcluster") - id, err := r.Start(job) + cacheID, err := r.Start(job) require.NoError(t, err) - // Stop the cached job - duration := int32(3600) - state := schema.JobStateCompleted - monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful) + // Transfer the cached job to the main table + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() + require.NoError(t, err, "TransferCachedJobToMain should succeed") + assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID") - err = r.StopCached(id, duration, state, monitoringStatus) - require.NoError(t, err, "StopCached should succeed") - - // Verify job was updated in job_cache table - var retrievedDuration int32 - var retrievedState string - var retrievedMonStatus int32 - err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job_cache WHERE id = ?`, id).Scan( - &retrievedDuration, &retrievedState, &retrievedMonStatus) + // Verify job exists in job table + var count int + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count) require.NoError(t, err) - assert.Equal(t, duration, retrievedDuration) - assert.Equal(t, string(state), retrievedState) - assert.Equal(t, monitoringStatus, retrievedMonStatus) + assert.Equal(t, 1, count, "Job should exist in main table") + + // Verify job was removed from job_cache + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Job should be removed from cache") // Clean up - _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) require.NoError(t, err) }) - t.Run("stop cached job does not affect job table", func(t *testing.T) { + t.Run("transfer preserves job data", func(t *testing.T) { // Insert a job in job_cache job := createTestJob(999010, "testcluster") - id, err := r.Start(job) + cacheID, err := r.Start(job) require.NoError(t, err) - // Stop the cached job - err = r.StopCached(id, 3600, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)) + // Transfer the cached job + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() require.NoError(t, err) - // Verify job table was not affected - var count int - err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE job_id = ? 
AND cluster = ?`, - job.JobID, job.Cluster).Scan(&count) + // Verify the transferred job has the correct data + var jobID int64 + var cluster string + err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster) require.NoError(t, err) - assert.Equal(t, 0, count, "Job table should not be affected by StopCached") + assert.Equal(t, job.JobID, jobID) + assert.Equal(t, job.Cluster, cluster) // Clean up - _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) require.NoError(t, err) }) } From 035ac2384eb4be136b18308379c4b15ad194540c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 9 Feb 2026 21:56:41 +0100 Subject: [PATCH 12/31] Refactor GlobalMetricLists --- pkg/archive/clusterConfig.go | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 272eeb35..64851365 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -25,6 +25,7 @@ func initClusterConfig() error { GlobalUserMetricList = []*schema.GlobalMetricListItem{} NodeLists = map[string]map[string]NodeList{} metricLookup := make(map[string]schema.GlobalMetricListItem) + userMetricLookup := make(map[string]schema.GlobalMetricListItem) for _, c := range ar.GetClusters() { @@ -62,11 +63,12 @@ func initClusterConfig() error { if _, ok := metricLookup[mc.Name]; !ok { metricLookup[mc.Name] = schema.GlobalMetricListItem{ - Name: mc.Name, Scope: mc.Scope, Restrict: mc.Restrict, Unit: mc.Unit, Footprint: mc.Footprint, + Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, } } availability := schema.ClusterSupport{Cluster: cluster.Name} + userAvailability := schema.ClusterSupport{Cluster: cluster.Name} scLookup := make(map[string]*schema.SubClusterConfig) for _, scc := range mc.SubClusters { @@ -94,6 +96,7 @@ func initClusterConfig() error { newMetric.Footprint = mc.Footprint } + isRestricted := mc.Restrict if cfg, ok := scLookup[sc.Name]; ok { if cfg.Remove { continue @@ -105,9 +108,13 @@ func initClusterConfig() error { newMetric.Footprint = cfg.Footprint newMetric.Energy = cfg.Energy newMetric.LowerIsBetter = cfg.LowerIsBetter + isRestricted = cfg.Restrict } availability.SubClusters = append(availability.SubClusters, sc.Name) + if !isRestricted { + userAvailability.SubClusters = append(userAvailability.SubClusters, sc.Name) + } sc.MetricConfig = append(sc.MetricConfig, newMetric) if newMetric.Footprint != "" { @@ -124,6 +131,17 @@ func initClusterConfig() error { item := metricLookup[mc.Name] item.Availability = append(item.Availability, availability) metricLookup[mc.Name] = item + + if len(userAvailability.SubClusters) > 0 { + userItem, ok := userMetricLookup[mc.Name] + if !ok { + userItem = schema.GlobalMetricListItem{ + Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, + } + } + userItem.Availability = append(userItem.Availability, userAvailability) + userMetricLookup[mc.Name] = userItem + } } Clusters = append(Clusters, cluster) @@ -144,9 +162,9 @@ func initClusterConfig() error { for _, metric := range metricLookup { GlobalMetricList = append(GlobalMetricList, &metric) - if !metric.Restrict { - GlobalUserMetricList = append(GlobalUserMetricList, &metric) - } + } + for _, metric := range userMetricLookup { + GlobalUserMetricList = append(GlobalUserMetricList, &metric) } return nil From d21943a514124b39e3f4aa602b1d6a2feaa708e0 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger 
Date: Tue, 10 Feb 2026 07:52:58 +0100 Subject: [PATCH 13/31] Upgrade cc-lib --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 77da0104..6bcc3b08 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.2.1 + github.com/ClusterCockpit/cc-lib/v2 v2.2.2 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 diff --git a/go.sum b/go.sum index 40b90751..f4f41dfd 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsiu5HkyKq9E= +github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= From 1feb3baf68af9cc6fe5a222013e9d8d47ff54ed8 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 10 Feb 2026 07:53:30 +0100 Subject: [PATCH 14/31] Create copy of test db before unit tests --- internal/repository/node_test.go | 7 +++++++ internal/repository/repository_test.go | 20 ++++++++++++++++++-- internal/repository/stats_test.go | 10 ++-------- internal/repository/userConfig_test.go | 21 +++++++++++++++++++-- 4 files changed, 46 insertions(+), 12 deletions(-) diff --git a/internal/repository/node_test.go b/internal/repository/node_test.go index b863dc69..4286ab34 100644 --- a/internal/repository/node_test.go +++ b/internal/repository/node_test.go @@ -139,6 +139,13 @@ func nodeTestSetup(t *testing.T) { } archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) + Connect(dbfilepath) if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index 34852830..b9496143 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -6,6 +6,8 @@ package repository import ( "context" + "os" + "path/filepath" "testing" "github.com/ClusterCockpit/cc-backend/internal/graph/model" @@ -148,8 +150,22 @@ func getContext(tb testing.TB) context.Context { func setup(tb testing.TB) *JobRepository { tb.Helper() cclog.Init("warn", true) - dbfile := "testdata/job.db" - err := MigrateDB(dbfile) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") + noErr(tb, err) + dbfile := filepath.Join(tb.TempDir(), "job.db") + err = os.WriteFile(dbfile, srcData, 0o644) + noErr(tb, err) + + // Reset singletons so Connect uses the new temp DB + err = ResetConnection() + noErr(tb, err) + tb.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfile) noErr(tb, err) Connect(dbfile) return GetJobRepository() diff --git a/internal/repository/stats_test.go 
b/internal/repository/stats_test.go index a8dfc818..a6c2da17 100644 --- a/internal/repository/stats_test.go +++ b/internal/repository/stats_test.go @@ -25,17 +25,11 @@ func TestBuildJobStatsQuery(t *testing.T) { func TestJobStats(t *testing.T) { r := setup(t) - // First, count the actual jobs in the database (excluding test jobs) var expectedCount int - err := r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE cluster != 'testcluster'`).Scan(&expectedCount) + err := r.DB.QueryRow(`SELECT COUNT(*) FROM job`).Scan(&expectedCount) noErr(t, err) - filter := &model.JobFilter{} - // Exclude test jobs created by other tests - testCluster := "testcluster" - filter.Cluster = &model.StringInput{Neq: &testCluster} - - stats, err := r.JobsStats(getContext(t), []*model.JobFilter{filter}) + stats, err := r.JobsStats(getContext(t), []*model.JobFilter{}) noErr(t, err) if stats[0].TotalJobs != expectedCount { diff --git a/internal/repository/userConfig_test.go b/internal/repository/userConfig_test.go index cee59304..17ccbf78 100644 --- a/internal/repository/userConfig_test.go +++ b/internal/repository/userConfig_test.go @@ -31,8 +31,25 @@ func setupUserTest(t *testing.T) *UserCfgRepo { }` cclog.Init("info", true) - dbfilepath := "testdata/job.db" - err := MigrateDB(dbfilepath) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") + if err != nil { + t.Fatal(err) + } + dbfilepath := filepath.Join(t.TempDir(), "job.db") + if err := os.WriteFile(dbfilepath, srcData, 0o644); err != nil { + t.Fatal(err) + } + + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfilepath) if err != nil { t.Fatal(err) } From 0dff9fa07ff521c00311e7af5ca28d5db40f7ab4 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 10 Feb 2026 09:17:34 +0100 Subject: [PATCH 15/31] Update docs and agent files --- CLAUDE.md | 18 +++++++++--------- README.md | 6 +++--- internal/archiver/README.md | 1 - tools/convert-pem-pubkey/Readme.md | 2 +- web/frontend/README.md | 6 +++--- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 406f11ba..2148fdca 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -22,7 +22,7 @@ make make frontend # Build only the backend (requires frontend to be built first) -go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.4.4 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend +go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend ``` ### Testing @@ -41,7 +41,7 @@ go test ./internal/repository ### Code Generation ```bash -# Regenerate GraphQL schema and resolvers (after modifying api/*.graphqls) +# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls) make graphql # Regenerate Swagger/OpenAPI docs (after modifying API comments) @@ -90,7 +90,7 @@ The backend follows a layered architecture with clear separation of concerns: - Transaction support for batch operations - **internal/api**: REST API endpoints (Swagger/OpenAPI documented) - **internal/graph**: GraphQL API (uses gqlgen) - - Schema in `api/*.graphqls` + - Schema in `api/schema.graphqls` - Generated code in `internal/graph/generated/` - Resolvers in `internal/graph/schema.resolvers.go` - **internal/auth**: Authentication layer @@ -108,7 +108,7 @@ The backend follows a layered architecture with clear separation of concerns: - File system backend (default) - 
S3 backend - SQLite backend (experimental) -- **pkg/nats**: NATS client and message decoding utilities +- **internal/metricstoreclient**: Client for cc-metric-store queries ### Frontend Structure @@ -138,7 +138,7 @@ recommended). Configuration is per-cluster in `config.json`. 3. The first authenticator that returns true performs the actual `Login` 4. JWT tokens are used for API authentication -**Database Migrations**: SQL migrations in `internal/repository/migrations/` are +**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are applied automatically on startup. Version tracking in `version` table. **Scopes**: Metrics can be collected at different scopes: @@ -173,7 +173,7 @@ applied automatically on startup. Version tracking in `version` table. **GraphQL** (gqlgen): -- Schema: `api/*.graphqls` +- Schema: `api/schema.graphqls` - Config: `gqlgen.yml` - Generated code: `internal/graph/generated/` - Custom resolvers: `internal/graph/schema.resolvers.go` @@ -182,7 +182,7 @@ applied automatically on startup. Version tracking in `version` table. **Swagger/OpenAPI**: - Annotations in `internal/api/*.go` -- Generated docs: `api/docs.go`, `api/swagger.yaml` +- Generated docs: `internal/api/docs.go`, `api/swagger.yaml` - Run `make swagger` after API changes ## Testing Conventions @@ -196,7 +196,7 @@ applied automatically on startup. Version tracking in `version` table. ### Adding a new GraphQL field -1. Edit schema in `api/*.graphqls` +1. Edit schema in `api/schema.graphqls` 2. Run `make graphql` 3. Implement resolver in `internal/graph/schema.resolvers.go` @@ -215,7 +215,7 @@ applied automatically on startup. Version tracking in `version` table. ### Modifying database schema -1. Create new migration in `internal/repository/migrations/` +1. Create new migration in `internal/repository/migrations/sqlite3/` 2. Increment `repository.Version` 3. Test with fresh database and existing database diff --git a/README.md b/README.md index 475401f4..d01c7140 100644 --- a/README.md +++ b/README.md @@ -173,14 +173,14 @@ ln -s ./var/job-archive Job classification and application detection - [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager) Background task management and scheduled jobs + - [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient) + Client for cc-metric-store queries - [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg) contains Go packages that can be used by other projects. - [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive) - Job archive backend implementations (filesystem, S3) + Job archive backend implementations (filesystem, S3, SQLite) - [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore) In-memory metric data store with checkpointing and metric loading - - [`nats`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/nats) - NATS client and message handling - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools) Additional command line helper tools. 
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager) diff --git a/internal/archiver/README.md b/internal/archiver/README.md index 48aed797..53d00948 100644 --- a/internal/archiver/README.md +++ b/internal/archiver/README.md @@ -170,7 +170,6 @@ All exported functions are safe for concurrent use: - `Start()` - Safe to call once - `TriggerArchiving()` - Safe from multiple goroutines - `Shutdown()` - Safe to call once -- `WaitForArchiving()` - Deprecated, but safe Internal state is protected by: - Channel synchronization (`archiveChannel`) diff --git a/tools/convert-pem-pubkey/Readme.md b/tools/convert-pem-pubkey/Readme.md index 1429acc4..22fd0db2 100644 --- a/tools/convert-pem-pubkey/Readme.md +++ b/tools/convert-pem-pubkey/Readme.md @@ -16,7 +16,7 @@ CROSS_LOGIN_JWT_PUBLIC_KEY="+51iXX8BdLFocrppRxIw52xCOf8xFSH/eNilN5IHVGc=" Instructions -- `cd tools/convert-pem-pubkey-for-cc/` +- `cd tools/convert-pem-pubkey/` - Insert your public ed25519 PEM key into `dummy.pub` - `go run . dummy.pub` - Copy the result into ClusterCockpit's `.env` diff --git a/web/frontend/README.md b/web/frontend/README.md index d61d302e..4dff4405 100644 --- a/web/frontend/README.md +++ b/web/frontend/README.md @@ -1,11 +1,11 @@ # cc-frontend -[![Build](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml/badge.svg)](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml) +[![Build](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml) -A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can de done using the constants defined in the `intro` section in `./rollup.config.js`. +A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can be done using the constants defined in the `intro` section in `./rollup.config.mjs`. 
Builds on: -* [Svelte](https://svelte.dev/) +* [Svelte 5](https://svelte.dev/) * [SvelteStrap](https://sveltestrap.js.org/) * [Bootstrap 5](https://getbootstrap.com/) * [urql](https://github.com/FormidableLabs/urql) From 49a1748641e7b6875659e6ab44c89791c2df943d Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 10 Feb 2026 13:49:23 +0100 Subject: [PATCH 16/31] add not configured info cards, show short job filter options if one active filter --- go.sum | 2 - web/frontend/src/Job.root.svelte | 22 +++++++--- web/frontend/src/Jobs.root.svelte | 3 +- web/frontend/src/Node.root.svelte | 36 +++++++++++------ web/frontend/src/User.root.svelte | 3 +- .../src/generic/joblist/JobListRow.svelte | 11 +++-- web/frontend/src/generic/utils.js | 40 +++++++++++++------ web/frontend/src/systems/NodeOverview.svelte | 19 +++++++-- .../src/systems/nodelist/NodeListRow.svelte | 32 +++++---------- 9 files changed, 105 insertions(+), 63 deletions(-) diff --git a/go.sum b/go.sum index f4f41dfd..a407436f 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,6 @@ github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= -github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsiu5HkyKq9E= github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 99dfa7ac..3baed1c1 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -30,7 +30,7 @@ import { init, groupByScope, - checkMetricDisabled, + checkMetricAvailability, } from "./generic/utils.js"; import Metric from "./job/Metric.svelte"; import MetricSelection from "./generic/select/MetricSelection.svelte"; @@ -151,17 +151,17 @@ } return names; }, []); - + // return metricNames.filter( (metric) => !metrics.some((jm) => jm.name == metric) && selectedMetrics.includes(metric) && - !checkMetricDisabled( + (checkMetricAvailability( globalMetrics, metric, thisJob.cluster, thisJob.subCluster, - ), + ) == "configured") ); } else { return [] @@ -212,7 +212,7 @@ inputMetrics.map((metric) => ({ metric: metric, data: grouped.find((group) => group[0].name == metric), - disabled: checkMetricDisabled( + availability: checkMetricAvailability( globalMetrics, metric, thisJob.cluster, @@ -333,7 +333,17 @@ {:else if thisJob && $jobMetrics?.data?.scopedJobStats} {#snippet gridContent(item)} - {#if item?.disabled} + {#if item.availability == "none"} + + + Metric not configured + + +

+ No datasets returned for {item.metric}.
+ Metric is not configured for cluster {thisJob.cluster}.
+ {:else if item.availability == "disabled"} Disabled Metric diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index 0d543fc8..a06aee3c 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -142,7 +142,8 @@ 0)} shortJobCutoff={ccconfig?.jobList_hideShortRunningJobs} showFilter={!showCompare} matchedJobs={showCompare? matchedCompareJobs: matchedListJobs} diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index d3364b49..06056466 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -32,7 +32,7 @@ } from "@urql/svelte"; import { init, - checkMetricDisabled, + checkMetricAvailability, } from "./generic/utils.js"; import PlotGrid from "./generic/PlotGrid.svelte"; import MetricPlot from "./generic/plots/MetricPlot.svelte"; @@ -242,17 +242,17 @@ {item.name} {systemUnits[item.name] ? "(" + systemUnits[item.name] + ")" : ""} - {#if item.disabled === false && item.metric} - c.name == cluster)} - subCluster={$nodeMetricsData.data.nodeMetrics[0].subCluster} - series={item.metric.series} - enableFlip - forNode - /> - {:else if item.disabled === true && item.metric} + {#if item.availability == "none"} + + + Metric not configured + + +

+ No datasets returned for {item.name}.
+ Metric is not configured for cluster {cluster}.
+ {:else if item.availability == "disabled"} Disabled Metric @@ -262,6 +262,16 @@

Metric has been disabled for subcluster {$nodeMetricsData.data.nodeMetrics[0].subCluster}.

+ {:else if item?.metric} + c.name == cluster)} + subCluster={$nodeMetricsData.data.nodeMetrics[0].subCluster} + series={item.metric.series} + enableFlip + forNode + /> {:else} @@ -279,7 +289,7 @@ items={$nodeMetricsData.data.nodeMetrics[0].metrics .map((m) => ({ ...m, - disabled: checkMetricDisabled( + availability: checkMetricAvailability( globalMetrics, m.name, cluster, diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 4ee3f892..d086df14 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -219,7 +219,8 @@ 0)} shortJobCutoff={ccconfig?.jobList_hideShortRunningJobs} showFilter={!showCompare} matchedJobs={showCompare? matchedCompareJobs: matchedListJobs} diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte index 5d129ad0..9db340d4 100644 --- a/web/frontend/src/generic/joblist/JobListRow.svelte +++ b/web/frontend/src/generic/joblist/JobListRow.svelte @@ -19,7 +19,7 @@ @@ -169,7 +152,12 @@ {#each refinedData as metricData, i (metricData?.data?.name || i)} - {#if metricData?.disabled} + {#if metricData?.availability == "none"} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric is not configured for cluster {cluster}.

+
+ {:else if metricData?.availability == "disabled"}

No dataset(s) returned for {selectedMetrics[i]}

Metric has been disabled for subcluster {nodeData.subCluster}.

@@ -177,7 +165,7 @@ {:else if !metricData?.data}

No dataset(s) returned for {selectedMetrics[i]}

-

Metric was not found in metric store for cluster {cluster}.

+

Metric or host was not found in metric store for cluster {cluster}.

{:else if !!metricData.data?.metric.statisticsSeries} From a5a1fd1a6a8da35bbc3316b4ed73fb8660baaef1 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 10 Feb 2026 15:47:38 +0100 Subject: [PATCH 17/31] fix missing component argument --- web/frontend/src/status/dashdetails/StatisticsDash.svelte | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/frontend/src/status/dashdetails/StatisticsDash.svelte b/web/frontend/src/status/dashdetails/StatisticsDash.svelte index 42c6823f..2cf8621e 100644 --- a/web/frontend/src/status/dashdetails/StatisticsDash.svelte +++ b/web/frontend/src/status/dashdetails/StatisticsDash.svelte @@ -35,6 +35,7 @@ /* Const Init */ const ccconfig = getContext("cc-config"); + const globalMetrics = getContext("globalMetrics"); const client = getContextClient(); /* State Init */ @@ -139,6 +140,7 @@ Date: Tue, 10 Feb 2026 16:46:18 +0100 Subject: [PATCH 18/31] revert external config supply for nodeList component --- internal/graph/schema.resolvers.go | 3 ++- web/frontend/src/Systems.root.svelte | 4 ++-- web/frontend/src/systems/NodeList.svelte | 17 +++++++++-------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 19d04eab..a233c3ba 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -867,7 +867,8 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeMetricsListResult := &model.NodesResultList{ - Items: nodeMetricsList, + Items: nodeMetricsList, + // TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357 TotalNodes: &countNodes, HasNextPage: &hasNextPage, } diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index fb5c4495..d89b5f06 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -272,8 +272,8 @@ {:else} - + {/if} {/if} diff --git a/web/frontend/src/systems/NodeList.svelte b/web/frontend/src/systems/NodeList.svelte index 4e8b45d9..da196b82 100644 --- a/web/frontend/src/systems/NodeList.svelte +++ b/web/frontend/src/systems/NodeList.svelte @@ -4,8 +4,6 @@ Properties: - `cluster String`: The nodes' cluster - `subCluster String`: The nodes' subCluster [Default: ""] - - `ccconfig Object?`: The ClusterCockpit Config Context [Default: null] - - `globalMetrics [Obj]`: Includes the backend supplied availabilities for cluster and subCluster - `pendingSelectedMetrics [String]`: The array of selected metrics [Default []] - `selectedResolution Number?`: The selected data resolution [Default: 0] - `hostnameFilter String?`: The active hostnamefilter [Default: ""] @@ -16,7 +14,7 @@ --> From 8d6c6b819b3f25b8a498930c5709a9f97734a3ac Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 11 Feb 2026 07:06:06 +0100 Subject: [PATCH 20/31] Update and port to cc-lib --- cmd/cc-backend/main.go | 2 +- go.mod | 2 +- go.sum | 2 ++ internal/api/cluster.go | 4 ++-- internal/api/job.go | 4 ++-- internal/api/user.go | 2 +- internal/auth/auth.go | 12 ++++++------ internal/graph/generated/generated.go | 2 +- internal/graph/schema.resolvers.go | 2 +- internal/metricdispatch/dataLoader.go | 2 +- internal/metricdispatch/dataLoader_test.go | 2 +- internal/metricstoreclient/cc-metric-store.go | 6 +++--- internal/repository/jobQuery.go | 2 +- internal/repository/tags.go | 4 ++-- pkg/archive/json.go | 2 +- pkg/archive/parquet/convert.go | 1 - pkg/archive/parquet/writer_test.go | 1 - pkg/metricstore/query.go | 6 +++--- 18 files 
changed, 29 insertions(+), 29 deletions(-) diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 3c70a960..3ee05383 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -248,7 +248,7 @@ func generateJWT(authHandle *auth.Authentication, username string) error { return fmt.Errorf("getting user '%s': %w", username, err) } - if !user.HasRole(schema.RoleApi) { + if !user.HasRole(schema.RoleAPI) { cclog.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username) } diff --git a/go.mod b/go.mod index 6bcc3b08..fedc6a22 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.2.2 + github.com/ClusterCockpit/cc-lib/v2 v2.3.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 diff --git a/go.sum b/go.sum index a407436f..5573c63a 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+ github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsiu5HkyKq9E= github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.3.0 h1:69NqCAYCU1r2w6J5Yuxoe8jfR68VLqtWwsWXZ6KTOo4= +github.com/ClusterCockpit/cc-lib/v2 v2.3.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= diff --git a/internal/api/cluster.go b/internal/api/cluster.go index d1c3c898..5e6e3a27 100644 --- a/internal/api/cluster.go +++ b/internal/api/cluster.go @@ -36,9 +36,9 @@ type GetClustersAPIResponse struct { // @router /api/clusters/ [get] func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { + !user.HasRole(schema.RoleAPI) { - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) return } diff --git a/internal/api/job.go b/internal/api/job.go index 9bd93b1c..66258668 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -1054,8 +1054,8 @@ type GetUsedNodesAPIResponse struct { // @router /api/jobs/used_nodes [get] func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + !user.HasRole(schema.RoleAPI) { + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) return } diff --git a/internal/api/user.go b/internal/api/user.go index 5eba0dfc..e2f78165 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) { return } - if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { + if len(password) 
== 0 && role != schema.GetRoleString(schema.RoleAPI) { handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw) return } diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 8a2073b5..9b1e2121 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -448,13 +448,13 @@ func (auth *Authentication) AuthAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) { + if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -484,13 +484,13 @@ func (auth *Authentication) AuthUserAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -520,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index e1e5ea71..965fd860 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -10245,7 +10245,7 @@ func (ec *executionContext) _Series_id(ctx context.Context, field graphql.Collec field, ec.fieldContext_Series_id, func(ctx context.Context) (any, error) { - return obj.Id, nil + return obj.ID, nil }, nil, ec.marshalOString2áš–string, diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 19d04eab..af04d94d 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -552,7 +552,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [ for _, stat := range stats { mdlStats = append(mdlStats, &model.ScopedStats{ Hostname: stat.Hostname, - ID: stat.Id, + ID: stat.ID, Data: stat.Data, }) } diff --git a/internal/metricdispatch/dataLoader.go b/internal/metricdispatch/dataLoader.go index 78808a74..c420fee4 100644 --- a/internal/metricdispatch/dataLoader.go +++ b/internal/metricdispatch/dataLoader.go @@ 
-499,7 +499,7 @@ func copyJobMetric(src *schema.JobMetric) *schema.JobMetric { func copySeries(src *schema.Series) schema.Series { dst := schema.Series{ Hostname: src.Hostname, - Id: src.Id, + ID: src.ID, Statistics: src.Statistics, Data: make([]schema.Float, len(src.Data)), } diff --git a/internal/metricdispatch/dataLoader_test.go b/internal/metricdispatch/dataLoader_test.go index c4841f8d..65a366f9 100644 --- a/internal/metricdispatch/dataLoader_test.go +++ b/internal/metricdispatch/dataLoader_test.go @@ -21,7 +21,7 @@ func TestDeepCopy(t *testing.T) { Series: []schema.Series{ { Hostname: "node001", - Id: &nodeId, + ID: &nodeId, Data: []schema.Float{1.0, 2.0, 3.0}, Statistics: schema.MetricStatistics{ Min: 1.0, diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index aadbe1b1..4472b825 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -267,7 +267,7 @@ func (ccms *CCMetricStore) LoadData( jobMetric.Series = append(jobMetric.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -419,7 +419,7 @@ func (ccms *CCMetricStore) LoadScopedStats( scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: query.Hostname, - Id: id, + ID: id, Data: &schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -634,7 +634,7 @@ func (ccms *CCMetricStore) LoadNodeListData( scopeData.Series = append(scopeData.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index 658413e8..81779583 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -150,7 +150,7 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select } switch { - case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): + case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI): return query, nil case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): return query, nil diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 612666da..943dda66 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -644,12 +644,12 @@ func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scop if user != nil { switch { case operation == "write" && scope == "admin": - if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil case operation == "write" && scope == "global": - if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil diff --git a/pkg/archive/json.go b/pkg/archive/json.go index cf1b0a38..dd37075d 100644 --- a/pkg/archive/json.go +++ b/pkg/archive/json.go @@ -51,7 +51,7 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) { for _, series := range jobMetric.Series { scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], 
&schema.ScopedStats{ Hostname: series.Hostname, - Id: series.Id, + ID: series.ID, Data: &series.Statistics, }) } diff --git a/pkg/archive/parquet/convert.go b/pkg/archive/parquet/convert.go index ceaa3f2f..ba1e76eb 100644 --- a/pkg/archive/parquet/convert.go +++ b/pkg/archive/parquet/convert.go @@ -81,7 +81,6 @@ func JobToParquetRow(meta *schema.Job, data *schema.JobData) (*ParquetJobRow, er NumNodes: meta.NumNodes, NumHWThreads: meta.NumHWThreads, NumAcc: meta.NumAcc, - Exclusive: meta.Exclusive, Energy: meta.Energy, SMT: meta.SMT, ResourcesJSON: resourcesJSON, diff --git a/pkg/archive/parquet/writer_test.go b/pkg/archive/parquet/writer_test.go index 6baaa527..e532e472 100644 --- a/pkg/archive/parquet/writer_test.go +++ b/pkg/archive/parquet/writer_test.go @@ -47,7 +47,6 @@ func makeTestJob(jobID int64) (*schema.Job, *schema.JobData) { Walltime: 7200, NumNodes: 2, NumHWThreads: 16, - Exclusive: 1, SMT: 1, Resources: []*schema.Resource{ {Hostname: "node001"}, diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index e5a49af3..709a9710 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -149,7 +149,7 @@ func (ccms *InternalMetricStore) LoadData( jobMetric.Series = append(jobMetric.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -651,7 +651,7 @@ func (ccms *InternalMetricStore) LoadScopedStats( scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: query.Hostname, - Id: id, + ID: id, Data: &schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -894,7 +894,7 @@ func (ccms *InternalMetricStore) LoadNodeListData( scopeData.Series = append(scopeData.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), From 12e9f6700efbe20181eb4c9f653b0c2368e7de76 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 11 Feb 2026 16:16:09 +0100 Subject: [PATCH 21/31] fix nodeList resolver data handling, increase nodestate filter cutoff - add comment on cutoff --- internal/graph/schema.resolvers.go | 8 +++++--- internal/repository/node.go | 12 ++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index eb565b7b..059bd16d 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -824,6 +824,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeRepo := repository.GetNodeRepository() + // nodes -> array hostname nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page) if nerr != nil { return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList") @@ -835,6 +836,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } } + // data -> map hostname:jobdata data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx) if err != nil { cclog.Warn("error while loading node data (Resolver.NodeMetricsList") @@ -842,18 +844,18 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeMetricsList := make([]*model.NodeMetrics, 0, len(data)) - for hostname, metrics := range data { + for _, hostname := range nodes { host := &model.NodeMetrics{ 
Host: hostname, State: stateMap[hostname], - Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)), + Metrics: make([]*model.JobMetricWithName, 0), } host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname) if err != nil { cclog.Warnf("error in nodeMetrics resolver: %s", err) } - for metric, scopedMetrics := range metrics { + for metric, scopedMetrics := range data[hostname] { for scope, scopedMetric := range scopedMetrics { host.Metrics = append(host.Metrics, &model.JobMetricWithName{ Name: metric, diff --git a/internal/repository/node.go b/internal/repository/node.go index df3aec8b..42e7b101 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -263,14 +263,16 @@ func (r *NodeRepository) QueryNodes( if f.SchedulerState != nil { query = query.Where("node_state = ?", f.SchedulerState) // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned + // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) + query = query.Where(sq.Gt{"time_stamp": (now - 300)}) } if f.HealthState != nil { query = query.Where("health_state = ?", f.HealthState) // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned + // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) + query = query.Where(sq.Gt{"time_stamp": (now - 300)}) } } @@ -331,14 +333,16 @@ func (r *NodeRepository) CountNodes( if f.SchedulerState != nil { query = query.Where("node_state = ?", f.SchedulerState) // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned + // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) + query = query.Where(sq.Gt{"time_stamp": (now - 300)}) } if f.HealthState != nil { query = query.Where("health_state = ?", f.HealthState) // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned + // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) + query = query.Where(sq.Gt{"time_stamp": (now - 300)}) } } From e75da7f8cc1376ed835f496fa85173e8f28e3ffc Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 11 Feb 2026 18:32:29 +0100 Subject: [PATCH 22/31] fix reactivity key placement in nodeList --- .../src/systems/nodelist/NodeListRow.svelte | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index b5bb9d77..46f8c4a4 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -151,24 +151,25 @@ {/if} {#each refinedData as metricData, i (metricData?.data?.name || i)} - - {#if metricData?.availability == "none"} + {#key metricData} + + {#if metricData?.availability == "none"}

No dataset(s) returned for {selectedMetrics[i]}

Metric is not configured for cluster {cluster}.

- {:else if metricData?.availability == "disabled"} - -

No dataset(s) returned for {selectedMetrics[i]}

-

Metric has been disabled for subcluster {nodeData.subCluster}.

-
- {:else if !metricData?.data} - -

No dataset(s) returned for {selectedMetrics[i]}

-

Metric or host was not found in metric store for cluster {cluster}.

-
- {:else if !!metricData.data?.metric.statisticsSeries} - + {:else if metricData?.availability == "disabled"} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric has been disabled for subcluster {nodeData.subCluster}.

+
+ {:else if !metricData?.data} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric or host was not found in metric store for cluster {cluster}.

+
+ {:else if !!metricData.data?.metric.statisticsSeries} + -
- {#key extendedLegendData} +
- {/key} - {:else} + {:else} - {/if} - + {/if} + + {/key} {/each} From f4ee0d10424d7153f8f59ac880cabe176fba5d1c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 12 Feb 2026 07:34:24 +0100 Subject: [PATCH 23/31] Update cc-lib and extend nodestate sql schema --- go.mod | 2 +- go.sum | 2 ++ internal/repository/migrations/sqlite3/10_node-table.up.sql | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index fedc6a22..f9bf7e42 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.3.0 + github.com/ClusterCockpit/cc-lib/v2 v2.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 diff --git a/go.sum b/go.sum index 5573c63a..8db4d1a3 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsi github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/ClusterCockpit/cc-lib/v2 v2.3.0 h1:69NqCAYCU1r2w6J5Yuxoe8jfR68VLqtWwsWXZ6KTOo4= github.com/ClusterCockpit/cc-lib/v2 v2.3.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0 h1:OnZlvqSatg7yCQ2NtSR7AddpUVSiuSMZ8scF1a7nfOk= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql index 7b5b5ac7..fd118f5d 100644 --- a/internal/repository/migrations/sqlite3/10_node-table.up.sql +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -23,6 +23,7 @@ CREATE TABLE "node_state" ( CHECK (health_state IN ( 'full', 'partial', 'failed' )), + health_metrics TEXT, -- JSON array of strings node_id INTEGER, FOREIGN KEY (node_id) REFERENCES node (id) ); From 865cd3db54cbb0ee7713b2fcd7c0ef2dd98e46b4 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 12 Feb 2026 08:48:15 +0100 Subject: [PATCH 24/31] Prersist faulty nodestate metric lists to db --- go.mod | 2 ++ go.sum | 6 ------ internal/api/node.go | 18 +++++++++++------ internal/repository/node.go | 7 ++++--- pkg/metricstore/healthcheck.go | 31 +++++++++++++++++++++++------ pkg/metricstore/metricstore_test.go | 4 ++-- 6 files changed, 45 insertions(+), 23 deletions(-) diff --git a/go.mod b/go.mod index f9bf7e42..b35eafe5 100644 --- a/go.mod +++ b/go.mod @@ -124,3 +124,5 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/ClusterCockpit/cc-lib/v2 => ../cc-lib diff --git a/go.sum b/go.sum index 8db4d1a3..d5bbe045 100644 --- a/go.sum +++ b/go.sum @@ -4,12 +4,6 @@ github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsiu5HkyKq9E= 
-github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= -github.com/ClusterCockpit/cc-lib/v2 v2.3.0 h1:69NqCAYCU1r2w6J5Yuxoe8jfR68VLqtWwsWXZ6KTOo4= -github.com/ClusterCockpit/cc-lib/v2 v2.3.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= -github.com/ClusterCockpit/cc-lib/v2 v2.4.0 h1:OnZlvqSatg7yCQ2NtSR7AddpUVSiuSMZ8scF1a7nfOk= -github.com/ClusterCockpit/cc-lib/v2 v2.4.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= diff --git a/internal/api/node.go b/internal/api/node.go index 27cde7f0..e6b19479 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -80,7 +80,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { ms := metricstore.GetMemoryStore() m := make(map[string][]string) - healthStates := make(map[string]schema.MonitoringState) + healthResults := make(map[string]metricstore.HealthCheckResult) startMs := time.Now() @@ -94,8 +94,8 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { if sc != "" { metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc) metricNames := metricListToNames(metricList) - if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil { - maps.Copy(healthStates, states) + if results, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil { + maps.Copy(healthResults, results) } } } @@ -106,8 +106,10 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { for _, node := range req.Nodes { state := determineState(node.States) healthState := schema.MonitoringStateFailed - if hs, ok := healthStates[node.Hostname]; ok { - healthState = hs + var healthMetrics string + if result, ok := healthResults[node.Hostname]; ok { + healthState = result.State + healthMetrics = result.HealthMetrics } nodeState := schema.NodeStateDB{ TimeStamp: requestReceived, @@ -116,10 +118,14 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { MemoryAllocated: node.MemoryAllocated, GpusAllocated: node.GpusAllocated, HealthState: healthState, + HealthMetrics: healthMetrics, JobsRunning: node.JobsRunning, } - repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) + if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil { + cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v", + node.Hostname, req.Cluster, err) + } } cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB)) diff --git a/internal/repository/node.go b/internal/repository/node.go index 42e7b101..82dcf067 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -169,9 +169,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) { } const NamedNodeStateInsert string = ` -INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated, - memory_allocated, gpus_allocated, jobs_running, node_id) - VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` +INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics, + cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id) + VALUES (:time_stamp, :node_state, :health_state, 
:health_metrics, + :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` // TODO: Add real Monitoring Health State diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index ed1ff38e..d6def692 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -6,6 +6,7 @@ package metricstore import ( + "encoding/json" "fmt" "time" @@ -19,6 +20,13 @@ type HealthCheckResponse struct { Error error } +// HealthCheckResult holds the monitoring state and raw JSON health metrics +// for a single node as determined by HealthCheck. +type HealthCheckResult struct { + State schema.MonitoringState + HealthMetrics string // JSON: {"missing":[...],"degraded":[...]} +} + // MaxMissingDataPoints is the threshold for stale data detection. // A buffer is considered healthy if the gap between its last data point // and the current time is within MaxMissingDataPoints * frequency. @@ -134,15 +142,15 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []str // - MonitoringStateFailed: node not found, or no healthy metrics at all func (m *MemoryStore) HealthCheck(cluster string, nodes []string, expectedMetrics []string, -) (map[string]schema.MonitoringState, error) { - results := make(map[string]schema.MonitoringState, len(nodes)) +) (map[string]HealthCheckResult, error) { + results := make(map[string]HealthCheckResult, len(nodes)) for _, hostname := range nodes { selector := []string{cluster, hostname} degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics) if err != nil { - results[hostname] = schema.MonitoringStateFailed + results[hostname] = HealthCheckResult{State: schema.MonitoringStateFailed} continue } @@ -158,13 +166,24 @@ func (m *MemoryStore) HealthCheck(cluster string, cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList) } + var state schema.MonitoringState switch { case degradedCount == 0 && missingCount == 0: - results[hostname] = schema.MonitoringStateFull + state = schema.MonitoringStateFull case healthyCount == 0: - results[hostname] = schema.MonitoringStateFailed + state = schema.MonitoringStateFailed default: - results[hostname] = schema.MonitoringStatePartial + state = schema.MonitoringStatePartial + } + + hm, _ := json.Marshal(map[string][]string{ + "missing": missingList, + "degraded": degradedList, + }) + + results[hostname] = HealthCheckResult{ + State: state, + HealthMetrics: string(hm), } } diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 4d68d76c..a9ff0055 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -253,8 +253,8 @@ func TestHealthCheck(t *testing.T) { // Check status if wantStatus, ok := tt.wantStates[node]; ok { - if state != wantStatus { - t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus) + if state.State != wantStatus { + t.Errorf("HealthCheck() node %s status = %v, want %v", node, state.State, wantStatus) } } } From 54ea5d790054dc87b82d43b61b9ac6b180f36684 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 12 Feb 2026 09:21:44 +0100 Subject: [PATCH 25/31] Add nodestate retention and archiving --- internal/config/config.go | 17 +++ internal/config/schema.go | 53 ++++++++ internal/repository/node.go | 65 ++++++++++ .../taskmanager/nodestateRetentionService.go | 120 ++++++++++++++++++ internal/taskmanager/taskManager.go | 21 +++ pkg/archive/parquet/nodestate_schema.go | 20 +++ 
pkg/archive/parquet/nodestate_writer.go | 104 +++++++++++++++ 7 files changed, 400 insertions(+) create mode 100644 internal/taskmanager/nodestateRetentionService.go create mode 100644 pkg/archive/parquet/nodestate_schema.go create mode 100644 pkg/archive/parquet/nodestate_writer.go diff --git a/internal/config/config.go b/internal/config/config.go index 4e6fe975..2e601ed7 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -71,6 +71,23 @@ type ProgramConfig struct { // If exists, will enable dynamic zoom in frontend metric plots using the configured values EnableResampling *ResampleConfig `json:"resampling"` + + // Node state retention configuration + NodeStateRetention *NodeStateRetention `json:"nodestate-retention"` +} + +type NodeStateRetention struct { + Policy string `json:"policy"` // "delete" or "parquet" + Age int `json:"age"` // hours, default 24 + TargetKind string `json:"target-kind"` // "file" or "s3" + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } type ResampleConfig struct { diff --git a/internal/config/schema.go b/internal/config/schema.go index 0d575b3c..bd1b314e 100644 --- a/internal/config/schema.go +++ b/internal/config/schema.go @@ -130,6 +130,59 @@ var configSchema = ` } }, "required": ["subject-job-event", "subject-node-state"] + }, + "nodestate-retention": { + "description": "Node state retention configuration for cleaning up old node_state rows.", + "type": "object", + "properties": { + "policy": { + "description": "Retention policy: 'delete' to remove old rows, 'parquet' to archive then delete.", + "type": "string", + "enum": ["delete", "parquet"] + }, + "age": { + "description": "Retention age in hours (default: 24).", + "type": "integer" + }, + "target-kind": { + "description": "Target kind for parquet archiving: 'file' or 's3'.", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Filesystem path for parquet file target.", + "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL.", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name.", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key.", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key.", + "type": "string" + }, + "target-region": { + "description": "S3 region.", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 addressing.", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB (default: 128).", + "type": "integer" + } + }, + "required": ["policy"] } } }` diff --git a/internal/repository/node.go b/internal/repository/node.go index 82dcf067..a746182b 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -225,6 +225,71 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt // return nil // } +// NodeStateWithNode combines a node state row with denormalized node info. 
+type NodeStateWithNode struct { + ID int64 `db:"id"` + TimeStamp int64 `db:"time_stamp"` + NodeState string `db:"node_state"` + HealthState string `db:"health_state"` + HealthMetrics string `db:"health_metrics"` + CpusAllocated int `db:"cpus_allocated"` + MemoryAllocated int64 `db:"memory_allocated"` + GpusAllocated int `db:"gpus_allocated"` + JobsRunning int `db:"jobs_running"` + Hostname string `db:"hostname"` + Cluster string `db:"cluster"` + SubCluster string `db:"subcluster"` +} + +// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff, +// joined with node info for denormalized archiving. +func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) { + rows, err := sq.Select( + "node_state.id", "node_state.time_stamp", "node_state.node_state", + "node_state.health_state", "node_state.health_metrics", + "node_state.cpus_allocated", "node_state.memory_allocated", + "node_state.gpus_allocated", "node_state.jobs_running", + "node.hostname", "node.cluster", "node.subcluster", + ). + From("node_state"). + Join("node ON node_state.node_id = node.id"). + Where(sq.Lt{"node_state.time_stamp": cutoff}). + Where("node_state.id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)"). + OrderBy("node_state.time_stamp ASC"). + RunWith(r.DB).Query() + if err != nil { + return nil, err + } + defer rows.Close() + + var result []NodeStateWithNode + for rows.Next() { + var ns NodeStateWithNode + if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState, + &ns.HealthState, &ns.HealthMetrics, + &ns.CpusAllocated, &ns.MemoryAllocated, + &ns.GpusAllocated, &ns.JobsRunning, + &ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil { + return nil, err + } + result = append(result, ns) + } + return result, nil +} + +// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff, +// but always preserves the latest row per node_id. +func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) { + res, err := r.DB.Exec( + `DELETE FROM node_state WHERE time_stamp < ? AND id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)`, + cutoff, + ) + if err != nil { + return 0, err + } + return res.RowsAffected() +} + func (r *NodeRepository) DeleteNode(id int64) error { _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id) if err != nil { diff --git a/internal/taskmanager/nodestateRetentionService.go b/internal/taskmanager/nodestateRetentionService.go new file mode 100644 index 00000000..9a704502 --- /dev/null +++ b/internal/taskmanager/nodestateRetentionService.go @@ -0,0 +1,120 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package taskmanager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/go-co-op/gocron/v2" +) + +func RegisterNodeStateRetentionDeleteService(ageHours int) { + cclog.Info("Register node state retention delete service") + + s.NewJob(gocron.DurationJob(1*time.Hour), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState retention: error deleting old rows: %v", err) + } else if cnt > 0 { + cclog.Infof("NodeState retention: deleted %d old rows", cnt) + } + })) +} + +func RegisterNodeStateRetentionParquetService(cfg *config.NodeStateRetention) { + cclog.Info("Register node state retention parquet service") + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 128 + } + + ageHours := cfg.Age + if ageHours <= 0 { + ageHours = 24 + } + + var target pqarchive.ParquetTarget + var err error + + switch cfg.TargetKind { + case "s3": + target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.TargetEndpoint, + Bucket: cfg.TargetBucket, + AccessKey: cfg.TargetAccessKey, + SecretKey: cfg.TargetSecretKey, + Region: cfg.TargetRegion, + UsePathStyle: cfg.TargetUsePathStyle, + }) + default: + target, err = pqarchive.NewFileTarget(cfg.TargetPath) + } + + if err != nil { + cclog.Errorf("NodeState parquet retention: failed to create target: %v", err) + return + } + + s.NewJob(gocron.DurationJob(1*time.Hour), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + + rows, err := nodeRepo.FindNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState parquet retention: error finding rows: %v", err) + return + } + if len(rows) == 0 { + return + } + + cclog.Infof("NodeState parquet retention: archiving %d rows", len(rows)) + pw := pqarchive.NewNodeStateParquetWriter(target, maxFileSizeMB) + + for _, ns := range rows { + row := pqarchive.ParquetNodeStateRow{ + TimeStamp: ns.TimeStamp, + NodeState: ns.NodeState, + HealthState: ns.HealthState, + HealthMetrics: ns.HealthMetrics, + CpusAllocated: int32(ns.CpusAllocated), + MemoryAllocated: ns.MemoryAllocated, + GpusAllocated: int32(ns.GpusAllocated), + JobsRunning: int32(ns.JobsRunning), + Hostname: ns.Hostname, + Cluster: ns.Cluster, + SubCluster: ns.SubCluster, + } + if err := pw.AddRow(row); err != nil { + cclog.Errorf("NodeState parquet retention: add row: %v", err) + continue + } + } + + if err := pw.Close(); err != nil { + cclog.Errorf("NodeState parquet retention: close writer: %v", err) + return + } + + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState parquet retention: error deleting rows: %v", err) + } else { + cclog.Infof("NodeState parquet retention: deleted %d rows from db", cnt) + } + })) +} diff --git a/internal/taskmanager/taskManager.go b/internal/taskmanager/taskManager.go index e323557b..8cf6b4e6 100644 --- a/internal/taskmanager/taskManager.go +++ b/internal/taskmanager/taskManager.go @@ -144,9 +144,30 @@ func Start(cronCfg, archiveConfig json.RawMessage) { RegisterUpdateDurationWorker() RegisterCommitJobService() + if config.Keys.NodeStateRetention != nil && 
config.Keys.NodeStateRetention.Policy != "" { + initNodeStateRetention() + } + s.Start() } +func initNodeStateRetention() { + cfg := config.Keys.NodeStateRetention + age := cfg.Age + if age <= 0 { + age = 24 + } + + switch cfg.Policy { + case "delete": + RegisterNodeStateRetentionDeleteService(age) + case "parquet": + RegisterNodeStateRetentionParquetService(cfg) + default: + cclog.Warnf("Unknown nodestate-retention policy: %s", cfg.Policy) + } +} + // Shutdown stops the task manager and its scheduler. func Shutdown() { if s != nil { diff --git a/pkg/archive/parquet/nodestate_schema.go b/pkg/archive/parquet/nodestate_schema.go new file mode 100644 index 00000000..c9dfe363 --- /dev/null +++ b/pkg/archive/parquet/nodestate_schema.go @@ -0,0 +1,20 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +type ParquetNodeStateRow struct { + TimeStamp int64 `parquet:"time_stamp"` + NodeState string `parquet:"node_state"` + HealthState string `parquet:"health_state"` + HealthMetrics string `parquet:"health_metrics,optional"` + CpusAllocated int32 `parquet:"cpus_allocated"` + MemoryAllocated int64 `parquet:"memory_allocated"` + GpusAllocated int32 `parquet:"gpus_allocated"` + JobsRunning int32 `parquet:"jobs_running"` + Hostname string `parquet:"hostname"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"subcluster"` +} diff --git a/pkg/archive/parquet/nodestate_writer.go b/pkg/archive/parquet/nodestate_writer.go new file mode 100644 index 00000000..053417d6 --- /dev/null +++ b/pkg/archive/parquet/nodestate_writer.go @@ -0,0 +1,104 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + pq "github.com/parquet-go/parquet-go" +) + +// NodeStateParquetWriter batches ParquetNodeStateRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. +type NodeStateParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetNodeStateRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewNodeStateParquetWriter creates a new writer for node state parquet files. +func NewNodeStateParquetWriter(target ParquetTarget, maxSizeMB int) *NodeStateParquetWriter { + return &NodeStateParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddRow adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed first. +func (pw *NodeStateParquetWriter) AddRow(row ParquetNodeStateRow) error { + rowSize := estimateNodeStateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. 
+func (pw *NodeStateParquetWriter) Flush() error { + if len(pw.rows) == 0 { + return nil + } + + pw.fileCounter++ + fileName := fmt.Sprintf("cc-nodestate-%s-%03d.parquet", pw.datePrefix, pw.fileCounter) + + data, err := writeNodeStateParquetBytes(pw.rows) + if err != nil { + return fmt.Errorf("write parquet buffer: %w", err) + } + + if err := pw.target.WriteFile(fileName, data); err != nil { + return fmt.Errorf("write parquet file %q: %w", fileName, err) + } + + cclog.Infof("NodeState retention: wrote %s (%d rows, %d bytes)", fileName, len(pw.rows), len(data)) + pw.rows = pw.rows[:0] + pw.currentSize = 0 + return nil +} + +// Close flushes any remaining rows and finalizes the writer. +func (pw *NodeStateParquetWriter) Close() error { + return pw.Flush() +} + +func writeNodeStateParquetBytes(rows []ParquetNodeStateRow) ([]byte, error) { + var buf bytes.Buffer + + writer := pq.NewGenericWriter[ParquetNodeStateRow](&buf, + pq.Compression(&pq.Snappy), + ) + + if _, err := writer.Write(rows); err != nil { + return nil, err + } + if err := writer.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func estimateNodeStateRowSize(row *ParquetNodeStateRow) int64 { + size := int64(100) // fixed numeric fields + size += int64(len(row.NodeState) + len(row.HealthState) + len(row.HealthMetrics)) + size += int64(len(row.Hostname) + len(row.Cluster) + len(row.SubCluster)) + return size +} From f016bd42325911e90122eee46ac77ac6cdd56908 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 12 Feb 2026 09:30:14 +0100 Subject: [PATCH 26/31] Extend node repository unit tests --- go.mod | 2 - go.sum | 2 + internal/repository/node.go | 10 ++- internal/repository/node_test.go | 146 ++++++++++++++++++++++++++++++- 4 files changed, 151 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index b35eafe5..f9bf7e42 100644 --- a/go.mod +++ b/go.mod @@ -124,5 +124,3 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) - -replace github.com/ClusterCockpit/cc-lib/v2 => ../cc-lib diff --git a/go.sum b/go.sum index d5bbe045..509c659c 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0 h1:OnZlvqSatg7yCQ2NtSR7AddpUVSiuSMZ8scF1a7nfOk= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= diff --git a/internal/repository/node.go b/internal/repository/node.go index a746182b..08a694c6 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -254,7 +254,7 @@ func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode From("node_state"). Join("node ON node_state.node_id = node.id"). Where(sq.Lt{"node_state.time_stamp": cutoff}). - Where("node_state.id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)"). 
+ Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))"). OrderBy("node_state.time_stamp ASC"). RunWith(r.DB).Query() if err != nil { @@ -278,10 +278,14 @@ func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode } // DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff, -// but always preserves the latest row per node_id. +// but always preserves the row with the latest timestamp per node_id. func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) { res, err := r.DB.Exec( - `DELETE FROM node_state WHERE time_stamp < ? AND id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)`, + `DELETE FROM node_state WHERE time_stamp < ? + AND id NOT IN ( + SELECT id FROM node_state ns2 + WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id) + )`, cutoff, ) if err != nil { diff --git a/internal/repository/node_test.go b/internal/repository/node_test.go index 4286ab34..d1e86b9a 100644 --- a/internal/repository/node_test.go +++ b/internal/repository/node_test.go @@ -156,8 +156,12 @@ func nodeTestSetup(t *testing.T) { func TestUpdateNodeState(t *testing.T) { nodeTestSetup(t) + repo := GetNodeRepository() + now := time.Now().Unix() + nodeState := schema.NodeStateDB{ - TimeStamp: time.Now().Unix(), NodeState: "allocated", + TimeStamp: now, + NodeState: "allocated", CpusAllocated: 72, MemoryAllocated: 480, GpusAllocated: 0, @@ -165,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) { JobsRunning: 1, } - repo := GetNodeRepository() err := repo.UpdateNodeState("host124", "testcluster", &nodeState) if err != nil { - return + t.Fatal(err) } node, err := repo.GetNode("host124", "testcluster", false) if err != nil { - return + t.Fatal(err) } if node.NodeState != "allocated" { t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState) } + + t.Run("FindBeforeEmpty", func(t *testing.T) { + // Only the current-timestamp row exists, so nothing should be found before now + rows, err := repo.FindNodeStatesBefore(now) + if err != nil { + t.Fatal(err) + } + if len(rows) != 0 { + t.Errorf("expected 0 rows, got %d", len(rows)) + } + }) + + t.Run("DeleteOldRows", func(t *testing.T) { + // Insert 2 more old rows for host124 + for i, ts := range []int64{now - 7200, now - 3600} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: "allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 72, + MemoryAllocated: 480, + JobsRunning: i, + } + if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Delete rows older than 30 minutes + cutoff := now - 1800 + cnt, err := repo.DeleteNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should delete the 2 old rows + if cnt != 2 { + t.Errorf("expected 2 deleted rows, got %d", cnt) + } + + // Latest row should still exist + node, err := repo.GetNode("host124", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "allocated" { + t.Errorf("expected node state 'allocated', got %s", node.NodeState) + } + }) + + t.Run("PreservesLatestPerNode", func(t *testing.T) { + // Insert a single old row for host125 — it's the latest per node so it must survive + ns := schema.NodeStateDB{ + TimeStamp: now - 7200, + NodeState: "idle", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 0, + MemoryAllocated: 0, + JobsRunning: 0, + } + if err 
:= repo.UpdateNodeState("host125", "testcluster", &ns); err != nil { + t.Fatal(err) + } + + // Delete everything older than now — the latest per node should be preserved + _, err := repo.DeleteNodeStatesBefore(now) + if err != nil { + t.Fatal(err) + } + + // The latest row for host125 must still exist + node, err := repo.GetNode("host125", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "idle" { + t.Errorf("expected node state 'idle', got %s", node.NodeState) + } + + // Verify exactly 1 row remains for host125 + var countAfter int + if err := repo.DB.QueryRow( + "SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')"). + Scan(&countAfter); err != nil { + t.Fatal(err) + } + if countAfter != 1 { + t.Errorf("expected 1 row remaining for host125, got %d", countAfter) + } + }) + + t.Run("FindBeforeWithJoin", func(t *testing.T) { + // Insert old and current rows for host123 + for _, ts := range []int64{now - 7200, now} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: "allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 8, + MemoryAllocated: 1024, + GpusAllocated: 1, + JobsRunning: 1, + } + if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Find rows older than 30 minutes, excluding latest per node + cutoff := now - 1800 + rows, err := repo.FindNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should find the old host123 row + found := false + for _, row := range rows { + if row.Hostname == "host123" && row.TimeStamp == now-7200 { + found = true + if row.Cluster != "testcluster" { + t.Errorf("expected cluster 'testcluster', got %s", row.Cluster) + } + if row.SubCluster != "sc1" { + t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster) + } + if row.CpusAllocated != 8 { + t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated) + } + } + } + if !found { + t.Errorf("expected to find old host123 row among %d results", len(rows)) + } + }) } From 48729b172df47e56aff753cf4084a7fe792ad36e Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 12 Feb 2026 14:27:41 +0100 Subject: [PATCH 27/31] improve nodeList loading indicator, streamlining --- web/frontend/src/generic/JobList.svelte | 2 +- web/frontend/src/systems/NodeList.svelte | 20 +-- .../src/systems/nodelist/NodeListRow.svelte | 134 ++++++++++-------- 3 files changed, 83 insertions(+), 73 deletions(-) diff --git a/web/frontend/src/generic/JobList.svelte b/web/frontend/src/generic/JobList.svelte index 278f189e..3ccbb560 100644 --- a/web/frontend/src/generic/JobList.svelte +++ b/web/frontend/src/generic/JobList.svelte @@ -305,7 +305,7 @@ {#if $jobsStore.fetching || !$jobsStore.data} -
+
diff --git a/web/frontend/src/systems/NodeList.svelte b/web/frontend/src/systems/NodeList.svelte index da196b82..2e342168 100644 --- a/web/frontend/src/systems/NodeList.svelte +++ b/web/frontend/src/systems/NodeList.svelte @@ -104,7 +104,7 @@ let itemsPerPage = $derived(usePaging ? (ccconfig?.nodeList_nodesPerPage || 10) : 10); let paging = $derived({ itemsPerPage, page }); - const nodesQuery = $derived(queryStore({ + const nodesStore = $derived(queryStore({ client: client, query: nodeListQuery, variables: { @@ -122,7 +122,7 @@ requestPolicy: "network-only", // Resolution queries are cached, but how to access them? For now: reload on every change })); - const matchedNodes = $derived($nodesQuery?.data?.nodeMetricsList?.totalNodes || 0); + const matchedNodes = $derived($nodesStore?.data?.nodeMetricsList?.totalNodes || 0); /* Effects */ $effect(() => { @@ -135,7 +135,7 @@ } = document.documentElement; // Add 100 px offset to trigger load earlier - if (scrollTop + clientHeight >= scrollHeight - 100 && $nodesQuery?.data?.nodeMetricsList?.hasNextPage) { + if (scrollTop + clientHeight >= scrollHeight - 100 && $nodesStore?.data?.nodeMetricsList?.hasNextPage) { page += 1 }; }); @@ -143,9 +143,9 @@ }); $effect(() => { - if ($nodesQuery?.data) { + if ($nodesStore?.data) { untrack(() => { - handleNodes($nodesQuery?.data?.nodeMetricsList?.items); + handleNodes($nodesStore?.data?.nodeMetricsList?.items); }); selectedMetrics = [...pendingSelectedMetrics]; // Trigger Rerender in NodeListRow Only After Data is Fetched }; @@ -228,7 +228,7 @@ style="padding-top: {headerPaddingTop}px;" > {cluster} Node Info - {#if $nodesQuery.fetching} + {#if $nodesStore.fetching} {/if} @@ -245,22 +245,22 @@ - {#if $nodesQuery.error} + {#if $nodesStore.error} - {$nodesQuery.error.message} + {$nodesStore.error.message} {:else} {#each nodes as nodeData (nodeData.host)} - + {:else} No nodes found {/each} {/if} - {#if $nodesQuery.fetching || !$nodesQuery.data} + {#if $nodesStore.fetching || !$nodesStore.data}
diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 46f8c4a4..1fca83f2 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -4,6 +4,7 @@ Properties: - `cluster String`: The nodes' cluster - `nodeData Object`: The node data object including metric data + - `nodeDataFetching Bool`: Whether the metric query still runs - `selectedMetrics [String]`: The array of selected metrics - `globalMetrics [Obj]`: Includes the backend supplied availabilities for cluster and subCluster --> @@ -24,6 +25,7 @@ let { cluster, nodeData, + nodeDataFetching, selectedMetrics, globalMetrics } = $props(); @@ -72,7 +74,7 @@ ); const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null); - const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(selectedMetrics, nodeData.metrics) : []); + const refinedData = $derived(!nodeDataFetching ? sortAndSelectScope(selectedMetrics, nodeData.metrics) : []); const dataHealth = $derived(refinedData.filter((rd) => rd.availability == "configured").map((enabled) => (enabled?.data?.metric?.series?.length > 0))); /* Functions */ @@ -150,65 +152,73 @@ hoststate={nodeData?.state? nodeData.state: 'notindb'}/> {/if} - {#each refinedData as metricData, i (metricData?.data?.name || i)} - {#key metricData} - - {#if metricData?.availability == "none"} - -

No dataset(s) returned for {selectedMetrics[i]}

-

Metric is not configured for cluster {cluster}.

-
- {:else if metricData?.availability == "disabled"} - -

No dataset(s) returned for {selectedMetrics[i]}

-

Metric has been disabled for subcluster {nodeData.subCluster}.

-
- {:else if !metricData?.data} - -

No dataset(s) returned for {selectedMetrics[i]}

-

Metric or host was not found in metric store for cluster {cluster}.

-
- {:else if !!metricData.data?.metric.statisticsSeries} - - -
- - {:else} - - {/if} - - {/key} - {/each} + {#if nodeDataFetching} + +
+ +
+ + {:else} + {#each refinedData as metricData, i (metricData?.data?.name || i)} + {#key metricData} + + {#if metricData?.availability == "none"} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric is not configured for cluster {cluster}.

+
+ {:else if metricData?.availability == "disabled"} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric has been disabled for subcluster {nodeData.subCluster}.

+
+ {:else if !metricData?.data} + +

No dataset(s) returned for {selectedMetrics[i]}

+

Metric or host was not found in metric store for cluster {cluster}.

+
+ {:else if !!metricData.data?.metric.statisticsSeries} + + +
+ + {:else} + + {/if} + + {/key} + {/each} + {/if} From c15f1117f553010e6e9c55331f45e3dcd2ab1c71 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 12 Feb 2026 15:45:15 +0100 Subject: [PATCH 28/31] Review and improve node repo queries --- .../migrations/sqlite3/10_node-table.up.sql | 1 + internal/repository/node.go | 226 +++++++----------- 2 files changed, 83 insertions(+), 144 deletions(-) diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql index fd118f5d..b788a8a9 100644 --- a/internal/repository/migrations/sqlite3/10_node-table.up.sql +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -38,6 +38,7 @@ CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp); CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state); CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state); +CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC); -- Add NEW Indices For Increased Amounts of Tags CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id); diff --git a/internal/repository/node.go b/internal/repository/node.go index 08a694c6..2ffe6698 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -52,6 +52,38 @@ func GetNodeRepository() *NodeRepository { return nodeRepoInstance } +// latestStateCondition returns a squirrel expression that restricts node_state +// rows to the latest per node_id using a correlated subquery. +// Requires the query to join node and node_state tables. +func latestStateCondition() sq.Sqlizer { + return sq.Expr( + "node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)", + ) +} + +// applyNodeFilters applies common NodeFilter conditions to a query that joins +// the node and node_state tables with latestStateCondition. +func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder { + for _, f := range filters { + if f.Cluster != nil { + query = buildStringCondition("node.cluster", f.Cluster, query) + } + if f.SubCluster != nil { + query = buildStringCondition("node.subcluster", f.SubCluster, query) + } + if f.Hostname != nil { + query = buildStringCondition("node.hostname", f.Hostname, query) + } + if f.SchedulerState != nil { + query = query.Where("node_state.node_state = ?", f.SchedulerState) + } + if f.HealthState != nil { + query = query.Where("node_state.health_state = ?", f.HealthState) + } + } + return query +} + func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) { start := time.Now() @@ -82,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). 
+ Where(latestStateCondition()). Where("node.hostname = ?", hostname). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). RunWith(r.DB). - QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err) return nil, err } @@ -111,16 +142,15 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.id = ?", id). - GroupBy("node_state.node_id"). RunWith(r.DB). - QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err) return nil, err } @@ -313,40 +343,17 @@ func (r *NodeRepository) QueryNodes( order *model.OrderByInput, // Currently unused! ) ([]*schema.Node, error) { query, qerr := AccessCheck(ctx, - sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time"). + sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) if qerr != nil { return nil, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 300)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? 
- now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 300)}) - } - } - - query = query.GroupBy("node_id").OrderBy("hostname ASC") + query = applyNodeFilters(query, filters) + query = query.OrderBy("node.hostname ASC") if page != nil && page.ItemsPerPage != -1 { limit := uint64(page.ItemsPerPage) @@ -363,11 +370,10 @@ func (r *NodeRepository) QueryNodes( nodes := make([]*schema.Node, 0) for rows.Next() { node := schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster, - &node.NodeState, &node.HealthState, ×tamp); err != nil { + &node.NodeState, &node.HealthState); err != nil { rows.Close() - cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp) + cclog.Warn("Error while scanning rows (QueryNodes)") return nil, err } nodes = append(nodes, &node) @@ -377,74 +383,39 @@ func (r *NodeRepository) QueryNodes( } // CountNodes returns the total matched nodes based on a node filter. It always operates -// on the last state (largest timestamp). +// on the last state (largest timestamp) per node. func (r *NodeRepository) CountNodes( ctx context.Context, filters []*model.NodeFilter, ) (int, error) { query, qerr := AccessCheck(ctx, - sq.Select("time_stamp", "count(*) as countRes"). + sq.Select("COUNT(*)"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) if qerr != nil { return 0, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 300)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - // TODO: Hardcoded TimeDiff Suboptimal - Use Config Option? - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 300)}) - } - } + query = applyNodeFilters(query, filters) - query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1) - - rows, err := query.RunWith(r.stmtCache).Query() - if err != nil { + var count int + if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil { queryString, queryVars, _ := query.ToSql() cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err) return 0, err } - var totalNodes int - for rows.Next() { - var timestamp int - if err := rows.Scan(×tamp, &totalNodes); err != nil { - rows.Close() - cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp) - return 0, err - } - } - - return totalNodes, nil + return count, nil } func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { - q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). 
+ q := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -456,10 +427,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { defer rows.Close() for rows.Next() { node := &schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, - &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp) + &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warn("Error while scanning node list (ListNodes)") return nil, err } @@ -470,11 +440,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { } func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { - q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time"). + q := sq.Select("node.hostname", "node_state.node_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -487,9 +457,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { defer rows.Close() for rows.Next() { var hostname, nodestate string - var timestamp int - if err := rows.Scan(&hostname, &nodestate, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp) + if err := rows.Scan(&hostname, &nodestate); err != nil { + cclog.Warn("Error while scanning node list (MapNodes)") return nil, err } @@ -500,33 +469,16 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { } func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) { - query, qerr := AccessCheck(ctx, sq.Select("hostname", column, "MAX(time_stamp) as time").From("node")) + query, qerr := AccessCheck(ctx, + sq.Select(column). + From("node"). + Join("node_state ON node_state.node_id = node.id"). 
+ Where(latestStateCondition())) if qerr != nil { return nil, qerr } - query = query.Join("node_state ON node_state.node_id = node.id") - - for _, f := range filters { - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - } - } - - // Add Group and Order - query = query.GroupBy("hostname").OrderBy("hostname DESC") + query = applyNodeFilters(query, filters) rows, err := query.RunWith(r.stmtCache).Query() if err != nil { @@ -537,12 +489,10 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF stateMap := map[string]int{} for rows.Next() { - var hostname, state string - var timestamp int - - if err := rows.Scan(&hostname, &state, ×tamp); err != nil { + var state string + if err := rows.Scan(&state); err != nil { rows.Close() - cclog.Warnf("Error while scanning rows (CountStates) at time '%d'", timestamp) + cclog.Warn("Error while scanning rows (CountStates)") return nil, err } @@ -735,26 +685,14 @@ func (r *NodeRepository) GetNodesForList( } } else { - // DB Nodes: Count and Find Next Page + // DB Nodes: Count and derive hasNextPage from count var cerr error countNodes, cerr = r.CountNodes(ctx, queryFilters) if cerr != nil { cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)") return nil, nil, 0, false, cerr } - - // Example Page 4 @ 10 IpP : Does item 41 exist? - // Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists. 
- nextPage := &model.PageRequest{ - ItemsPerPage: 1, - Page: ((page.Page * page.ItemsPerPage) + 1), - } - nextNodes, err := r.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used - if err != nil { - cclog.Warn("Error while querying next nodes") - return nil, nil, 0, false, err - } - hasNextPage = len(nextNodes) == 1 + hasNextPage = page.Page*page.ItemsPerPage < countNodes } // Fallback for non-init'd node table in DB; Ignores stateFilter From 3215bc3de0a09888db800770da8d59a718a166e3 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 13 Feb 2026 11:58:52 +0100 Subject: [PATCH 29/31] review loading indicators in nodeList --- web/frontend/src/systems/NodeList.svelte | 21 ++- .../src/systems/nodelist/NodeInfo.svelte | 15 +- .../src/systems/nodelist/NodeListRow.svelte | 138 +++++++++--------- 3 files changed, 95 insertions(+), 79 deletions(-) diff --git a/web/frontend/src/systems/NodeList.svelte b/web/frontend/src/systems/NodeList.svelte index 2e342168..403a8030 100644 --- a/web/frontend/src/systems/NodeList.svelte +++ b/web/frontend/src/systems/NodeList.svelte @@ -152,12 +152,21 @@ }); $effect(() => { - // Triggers (Except Paging) + // Update NodeListRows metrics only: Keep ordered nodes on page 1 from, to pendingSelectedMetrics, selectedResolution + // Continuous Scroll: Paging if parameters change: Existing entries will not match new selections + if (!usePaging) { + nodes = []; + page = 1; + } + }); + + $effect(() => { + // Update NodeListRows metrics only: Keep ordered nodes on page 1 hostnameFilter, hoststateFilter // Continuous Scroll: Paging if parameters change: Existing entries will not match new selections - // Nodes Array Reset in HandleNodes func + nodes = []; if (!usePaging) { page = 1; } }); @@ -255,9 +264,11 @@ {#each nodes as nodeData (nodeData.host)} {:else} - - No nodes found - + {#if !$nodesStore.fetching} + + No nodes found + + {/if} {/each} {/if} {#if $nodesStore.fetching || !$nodesStore.data} diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte index 39716ca2..4b616f10 100644 --- a/web/frontend/src/systems/nodelist/NodeInfo.svelte +++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte @@ -51,6 +51,8 @@ /* Derived */ // Not at least one returned, selected metric: NodeHealth warning + const fetchInfo = $derived(dataHealth.includes('fetching')); + // Not at least one returned, selected metric: NodeHealth warning const healthWarn = $derived(!dataHealth.includes(true)); // At least one non-returned selected metric: Metric config error? const metricWarn = $derived(dataHealth.includes(false)); @@ -84,10 +86,17 @@ - {#if healthWarn} + {#if fetchInfo} + + + + + {:else if healthWarn} - Jobs + Info