From dd23f493642aa9ba7491d80ec4c123599696dcb0 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 28 Jan 2026 12:59:05 +0100 Subject: [PATCH] Add configuration options and enable dynamic memory management through cc-backend callback --- CLAUDE.md | 77 ++++++++++++++++++++++++++++++++++ cmd/cc-metric-store/main.go | 22 +++++++--- go.mod | 11 +---- go.sum | 20 ++------- internal/api/nodeprovider.go | 54 ++++++++++++++++++++++++ internal/config/config.go | 80 +++++++++++++++--------------------- internal/config/schema.go | 4 ++ 7 files changed, 191 insertions(+), 77 deletions(-) create mode 100644 CLAUDE.md create mode 100644 internal/api/nodeprovider.go diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f0048a3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,77 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +cc-metric-store is an in-memory time-series database for HPC cluster metrics, part of the ClusterCockpit monitoring suite. Data is indexed by a hierarchical tree (cluster → host → socket/cpu/gpu) and accessed via selectors. The core storage engine lives in `cc-backend/pkg/metricstore`; this repo provides the HTTP API wrapper. + +## Build Commands + +```bash +make # Build binary, copy config template, create checkpoint dirs +make clean # Clean build cache and binary +make distclean # Also remove ./var and config.json +make swagger # Regenerate Swagger from source comments +make test # Run go build, go vet, go test +``` + +## Testing + +```bash +go test -v ./... # Run tests +go test -bench=. -race -v ./... # With benchmarks and race detector +``` + +Integration test scripts in `/endpoint-test-scripts/` for manual API testing. + +## Running + +```bash +./cc-metric-store # Uses ./config.json +./cc-metric-store -config /path/to/config.json +./cc-metric-store -dev # Enable Swagger UI at /swagger/ +./cc-metric-store -loglevel debug # debug|info|warn|err|crit +``` + +## Architecture + +**Entry point:** `cmd/cc-metric-store/main.go` +- `run()` → parse flags, init logging/config, connect NATS +- `runServer()` → init metricstore from cc-backend, start HTTP server + +**Key packages:** +- `internal/api/` - REST endpoints (query, write, free, debug, healthcheck) and JWT auth (Ed25519) +- `internal/config/` - Config loading and JSON schema validation +- External: `cc-backend/pkg/metricstore` - actual time-series storage engine + +**API endpoints** (all support optional JWT auth): +- `GET /api/query/` - Query metrics with selectors +- `POST /api/write/` - Write metrics (InfluxDB line protocol) +- `POST /api/free/` - Free buffers up to timestamp +- `GET /api/debug/` - Dump internal state +- `GET /api/healthcheck/` - Node health status + +## Selectors + +Data is accessed via hierarchical selectors: +``` +["cluster1", "host1", "cpu0"] # Specific CPU +["cluster1", "host1", ["cpu4", "cpu5"]] # Multiple CPUs +["cluster1", "host1"] # Entire node (all CPUs implied) +``` + +## Configuration + +Config file structure (see `configs/config.json`): +- `main` - Server address, TLS certs, JWT public key, user/group for privilege drop +- `metrics` - Per-metric frequency and aggregation strategy (sum/avg/null) +- `metric-store` - Checkpoints, memory cap, retention, cleanup mode, NATS subscriptions +- `nats` - Optional NATS connection for receiving metrics + +## Test JWT + +For testing with JWT auth enabled: +``` +eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw +``` diff --git a/cmd/cc-metric-store/main.go b/cmd/cc-metric-store/main.go index 9072ce2..160af7a 100644 --- a/cmd/cc-metric-store/main.go +++ b/cmd/cc-metric-store/main.go @@ -19,6 +19,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/runtime" + "github.com/ClusterCockpit/cc-metric-store/internal/api" "github.com/ClusterCockpit/cc-metric-store/internal/config" "github.com/google/gops/agent" ) @@ -43,14 +44,25 @@ func printVersion() { func runServer(ctx context.Context) error { var wg sync.WaitGroup - // Initialize metric store if configuration is provided - mscfg := ccconf.GetPackageConfig("metric-store") - if mscfg != nil { - metricstore.Init(mscfg, &wg) - } else { + mscfg := ccconf.GetPackageConfig("metrics") + if mscfg == nil { + return fmt.Errorf("missing metrics configuration") + } + config.InitMetrics(mscfg) + + mscfg = ccconf.GetPackageConfig("metric-store") + if mscfg == nil { return fmt.Errorf("missing metricstore configuration") } + metricstore.Init(mscfg, config.GetMetrics(), &wg) + + if config.Keys.BackendURL != "" { + ms := metricstore.GetMemoryStore() + ms.SetNodeProvider(api.NewBackendNodeProvider(config.Keys.BackendURL)) + cclog.Infof("Node provider configured with backend URL: %s", config.Keys.BackendURL) + } + // Initialize HTTP server srv, err := NewServer(version, commit, date) if err != nil { diff --git a/go.mod b/go.mod index 8580571..166d42b 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,8 @@ module github.com/ClusterCockpit/cc-metric-store go 1.24.0 require ( - github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c - github.com/ClusterCockpit/cc-lib/v2 v2.2.0 + github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256 + github.com/ClusterCockpit/cc-lib/v2 v2.2.1 github.com/golang-jwt/jwt/v4 v4.5.0 github.com/google/gops v0.3.28 github.com/influxdata/line-protocol/v2 v2.2.1 @@ -34,7 +34,6 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect github.com/aws/smithy-go v1.24.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-openapi/jsonpointer v0.22.3 // indirect github.com/go-openapi/jsonreference v0.21.3 // indirect @@ -53,19 +52,13 @@ require ( github.com/nats-io/nats.go v1.47.0 // indirect github.com/nats-io/nkeys v0.4.12 // indirect github.com/nats-io/nuid v1.0.1 // indirect - github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/swaggo/files v1.0.1 // indirect - github.com/urfave/cli/v2 v2.27.7 // indirect - github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect - go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.46.0 // indirect golang.org/x/mod v0.31.0 // indirect golang.org/x/net v0.48.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.39.0 // indirect - golang.org/x/text v0.32.0 // indirect golang.org/x/tools v0.40.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect - sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 6e598b3..cd814bb 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ -github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c h1:rN1M3afMjlW4GUsa5jiR5OKA23IVpoeMrkbVlpk2sWw= -github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c/go.mod h1:RDlfymO/WgrcZ1eDxGpur2jTEFoMA8BfJUvV+Heb+E4= -github.com/ClusterCockpit/cc-lib/v2 v2.2.0 h1:gqMsh7zsJMUhaXviXzaZ3gqXcLVgerjRJHzIcwX4FmQ= -github.com/ClusterCockpit/cc-lib/v2 v2.2.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256 h1:PL8UhUBe+G6j1JoXybx27eKhgVgQ+Z0fQnvbVD3OmGA= +github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256/go.mod h1:y5LuqfWrSnVYjMaxseBwq72Tx4NpyQWwTHXwKsYAPUk= +github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= +github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= @@ -50,8 +50,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= -github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -155,8 +153,6 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= -github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= -github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -171,10 +167,6 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64 github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ= github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI= github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg= -github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= -github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= -github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= -github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= @@ -214,8 +206,6 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= -golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= @@ -233,5 +223,3 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= -sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/api/nodeprovider.go b/internal/api/nodeprovider.go new file mode 100644 index 0000000..2f57f3f --- /dev/null +++ b/internal/api/nodeprovider.go @@ -0,0 +1,54 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-metric-store. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package api + +import ( + "encoding/json" + "fmt" + "net/http" + "time" +) + +// BackendNodeProvider implements metricstore.NodeProvider by querying +// the cc-backend /api/jobs/used_nodes endpoint. +type BackendNodeProvider struct { + backendUrl string + client *http.Client +} + +// NewBackendNodeProvider creates a new BackendNodeProvider that queries +// the given cc-backend URL for used nodes information. +func NewBackendNodeProvider(backendUrl string) *BackendNodeProvider { + return &BackendNodeProvider{ + backendUrl: backendUrl, + client: &http.Client{ + Timeout: 10 * time.Second, + }, + } +} + +// GetUsedNodes returns a map of cluster names to sorted lists of unique hostnames +// that are currently in use by jobs that started before the given timestamp. +func (p *BackendNodeProvider) GetUsedNodes(ts int64) (map[string][]string, error) { + url := fmt.Sprintf("%s/api/jobs/used_nodes?ts=%d", p.backendUrl, ts) + + resp, err := p.client.Get(url) + if err != nil { + return nil, fmt.Errorf("querying used nodes from backend: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("backend returned status %d", resp.StatusCode) + } + + var result map[string][]string + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("decoding used nodes response: %w", err) + } + + return result, nil +} diff --git a/internal/config/config.go b/internal/config/config.go index e6ea240..4eb43fd 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -10,57 +10,20 @@ import ( "encoding/json" "fmt" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) -// For aggregation over multiple values at different cpus/sockets/..., not time! -type AggregationStrategy int - -const ( - NoAggregation AggregationStrategy = iota - SumAggregation - AvgAggregation -) - -func (as *AggregationStrategy) UnmarshalJSON(data []byte) error { - var str string - if err := json.Unmarshal(data, &str); err != nil { - return err - } - - switch str { - case "": - *as = NoAggregation - case "sum": - *as = SumAggregation - case "avg": - *as = AvgAggregation - default: - return fmt.Errorf("invalid aggregation strategy: %#v", str) - } - return nil -} - -type MetricConfig struct { - // Interval in seconds at which measurements will arive. - Frequency int64 `json:"frequency"` - - // Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy. - Aggregation AggregationStrategy `json:"aggregation"` - - // Private, used internally... - Offset int -} - -var metrics map[string]MetricConfig +var metrics map[string]metricstore.MetricConfig type Config struct { - Address string `json:"addr"` - CertFile string `json:"https-cert-file"` - KeyFile string `json:"https-key-file"` - User string `json:"user"` - Group string `json:"group"` - Debug struct { + Address string `json:"addr"` + CertFile string `json:"https-cert-file"` + KeyFile string `json:"https-key-file"` + User string `json:"user"` + Group string `json:"group"` + BackendURL string `json:"backend-url"` + Debug struct { DumpToFile string `json:"dump-to-file"` EnableGops bool `json:"gops"` } `json:"debug"` @@ -69,13 +32,32 @@ type Config struct { var Keys Config +type metricConfigJSON struct { + Frequency int64 `json:"frequency"` + Aggregation string `json:"aggregation"` +} + func InitMetrics(metricConfig json.RawMessage) { Validate(metricConfigSchema, metricConfig) + + var tempMetrics map[string]metricConfigJSON dec := json.NewDecoder(bytes.NewReader(metricConfig)) dec.DisallowUnknownFields() - if err := dec.Decode(&metrics); err != nil { + if err := dec.Decode(&tempMetrics); err != nil { cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", metricConfig, err.Error()) } + + metrics = make(map[string]metricstore.MetricConfig) + for name, cfg := range tempMetrics { + agg, err := metricstore.AssignAggregationStrategy(cfg.Aggregation) + if err != nil { + cclog.Warnf("Could not parse aggregation strategy for metric '%s': %s", name, err.Error()) + } + metrics[name] = metricstore.MetricConfig{ + Frequency: cfg.Frequency, + Aggregation: agg, + } + } } func Init(mainConfig json.RawMessage) { @@ -93,3 +75,7 @@ func GetMetricFrequency(metricName string) (int64, error) { } return 0, fmt.Errorf("metric %s not found", metricName) } + +func GetMetrics() map[string]metricstore.MetricConfig { + return metrics +} diff --git a/internal/config/schema.go b/internal/config/schema.go index 29e3723..514622a 100644 --- a/internal/config/schema.go +++ b/internal/config/schema.go @@ -29,6 +29,10 @@ var configSchema = ` "description": "Drop root permissions once the port was taken. Only applicable if using privileged port.", "type": "string" }, + "backend-url": { + "description": "URL of cc-backend for querying job information (e.g., 'https://localhost:8080').", + "type": "string" + }, "debug": { "description": "Debug options.", "type": "object",