Add configuration options and enable dynamic memory management through cc-backend callback

This commit is contained in:
2026-01-28 12:59:05 +01:00
parent 28f5ffe9c4
commit dd23f49364
7 changed files with 191 additions and 77 deletions

77
CLAUDE.md Normal file
View File

@@ -0,0 +1,77 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
cc-metric-store is an in-memory time-series database for HPC cluster metrics, part of the ClusterCockpit monitoring suite. Data is indexed by a hierarchical tree (cluster → host → socket/cpu/gpu) and accessed via selectors. The core storage engine lives in `cc-backend/pkg/metricstore`; this repo provides the HTTP API wrapper.
## Build Commands
```bash
make # Build binary, copy config template, create checkpoint dirs
make clean # Clean build cache and binary
make distclean # Also remove ./var and config.json
make swagger # Regenerate Swagger from source comments
make test # Run go build, go vet, go test
```
## Testing
```bash
go test -v ./... # Run tests
go test -bench=. -race -v ./... # With benchmarks and race detector
```
Integration test scripts for manual API testing are located in `/endpoint-test-scripts/`.
## Running
```bash
./cc-metric-store # Uses ./config.json
./cc-metric-store -config /path/to/config.json
./cc-metric-store -dev # Enable Swagger UI at /swagger/
./cc-metric-store -loglevel debug # debug|info|warn|err|crit
```
## Architecture
**Entry point:** `cmd/cc-metric-store/main.go`
- `run()` → parse flags, init logging/config, connect NATS
- `runServer()` → init metricstore from cc-backend, start HTTP server
**Key packages:**
- `internal/api/` - REST endpoints (query, write, free, debug, healthcheck) and JWT auth (Ed25519)
- `internal/config/` - Config loading and JSON schema validation
- External: `cc-backend/pkg/metricstore` - actual time-series storage engine
**API endpoints** (all support optional JWT auth):
- `GET /api/query/` - Query metrics with selectors
- `POST /api/write/` - Write metrics (InfluxDB line protocol)
- `POST /api/free/` - Free buffers up to timestamp
- `GET /api/debug/` - Dump internal state
- `GET /api/healthcheck/` - Node health status
## Selectors
Data is accessed via hierarchical selectors:
```
["cluster1", "host1", "cpu0"] # Specific CPU
["cluster1", "host1", ["cpu4", "cpu5"]] # Multiple CPUs
["cluster1", "host1"] # Entire node (all CPUs implied)
```
## Configuration
Config file structure (see `configs/config.json`):
- `main` - Server address, TLS certs, JWT public key, user/group for privilege drop, optional cc-backend URL (`backend-url`)
- `metrics` - Per-metric frequency and aggregation strategy (sum/avg/null)
- `metric-store` - Checkpoints, memory cap, retention, cleanup mode, NATS subscriptions
- `nats` - Optional NATS connection for receiving metrics
## Test JWT
For testing with JWT auth enabled:
```
eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw
```

View File

@@ -19,6 +19,7 @@ import (
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/ClusterCockpit/cc-metric-store/internal/api"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/google/gops/agent"
)
@@ -43,14 +44,25 @@ func printVersion() {
func runServer(ctx context.Context) error {
var wg sync.WaitGroup
// Initialize metric store if configuration is provided
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
metricstore.Init(mscfg, &wg)
} else {
mscfg := ccconf.GetPackageConfig("metrics")
if mscfg == nil {
return fmt.Errorf("missing metrics configuration")
}
config.InitMetrics(mscfg)
mscfg = ccconf.GetPackageConfig("metric-store")
if mscfg == nil {
return fmt.Errorf("missing metricstore configuration")
}
metricstore.Init(mscfg, config.GetMetrics(), &wg)
if config.Keys.BackendURL != "" {
ms := metricstore.GetMemoryStore()
ms.SetNodeProvider(api.NewBackendNodeProvider(config.Keys.BackendURL))
cclog.Infof("Node provider configured with backend URL: %s", config.Keys.BackendURL)
}
// Initialize HTTP server
srv, err := NewServer(version, commit, date)
if err != nil {

11
go.mod
View File

@@ -3,8 +3,8 @@ module github.com/ClusterCockpit/cc-metric-store
go 1.24.0
require (
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c
github.com/ClusterCockpit/cc-lib/v2 v2.2.0
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256
github.com/ClusterCockpit/cc-lib/v2 v2.2.1
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/google/gops v0.3.28
github.com/influxdata/line-protocol/v2 v2.2.1
@@ -34,7 +34,6 @@ require (
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect
github.com/aws/smithy-go v1.24.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/go-openapi/jsonpointer v0.22.3 // indirect
github.com/go-openapi/jsonreference v0.21.3 // indirect
@@ -53,19 +52,13 @@ require (
github.com/nats-io/nats.go v1.47.0 // indirect
github.com/nats-io/nkeys v0.4.12 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.7 // indirect
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/crypto v0.46.0 // indirect
golang.org/x/mod v0.31.0 // indirect
golang.org/x/net v0.48.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.39.0 // indirect
golang.org/x/text v0.32.0 // indirect
golang.org/x/tools v0.40.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

20
go.sum
View File

@@ -1,7 +1,7 @@
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c h1:rN1M3afMjlW4GUsa5jiR5OKA23IVpoeMrkbVlpk2sWw=
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260126082752-084d00cb0d0c/go.mod h1:RDlfymO/WgrcZ1eDxGpur2jTEFoMA8BfJUvV+Heb+E4=
github.com/ClusterCockpit/cc-lib/v2 v2.2.0 h1:gqMsh7zsJMUhaXviXzaZ3gqXcLVgerjRJHzIcwX4FmQ=
github.com/ClusterCockpit/cc-lib/v2 v2.2.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256 h1:PL8UhUBe+G6j1JoXybx27eKhgVgQ+Z0fQnvbVD3OmGA=
github.com/ClusterCockpit/cc-backend v1.4.4-0.20260128102127-0d857b49a256/go.mod h1:y5LuqfWrSnVYjMaxseBwq72Tx4NpyQWwTHXwKsYAPUk=
github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims=
github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
@@ -50,8 +50,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -155,8 +153,6 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -171,10 +167,6 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64
github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ=
github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI=
github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg=
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
@@ -214,8 +206,6 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
@@ -233,5 +223,3 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=

View File

@@ -0,0 +1,54 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-metric-store.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)
// BackendNodeProvider implements metricstore.NodeProvider by querying
// the cc-backend /api/jobs/used_nodes endpoint.
type BackendNodeProvider struct {
// backendUrl is the base URL of cc-backend; GetUsedNodes appends
// "/api/jobs/used_nodes" to it to build the request URL.
backendUrl string
// client is the HTTP client used for all requests to cc-backend.
client *http.Client
}
// NewBackendNodeProvider returns a BackendNodeProvider that sends its
// used-nodes queries to the cc-backend instance reachable at backendUrl.
// The provider owns a dedicated HTTP client whose requests time out after
// ten seconds.
func NewBackendNodeProvider(backendUrl string) *BackendNodeProvider {
	const requestTimeout = 10 * time.Second

	provider := new(BackendNodeProvider)
	provider.backendUrl = backendUrl
	provider.client = &http.Client{Timeout: requestTimeout}
	return provider
}
// GetUsedNodes asks cc-backend which nodes are in use and returns the decoded
// answer: a map from cluster name to a list of hostnames. The request is a GET
// to <backendUrl>/api/jobs/used_nodes?ts=<ts>; per the endpoint's contract the
// lists are sorted, unique, and cover jobs that started before ts.
//
// An error is returned when the request fails, when the backend answers with
// any status other than 200 OK, or when the response body is not valid JSON of
// the expected shape. The returned map is nil whenever the error is non-nil.
func (p *BackendNodeProvider) GetUsedNodes(ts int64) (map[string][]string, error) {
	// Upper bound on how much of an error response is read before giving up
	// on connection reuse.
	const maxErrorBodyDrain = 4 << 10

	// A base URL configured with a trailing slash must not produce a request
	// path that starts with "//", which routers may reject or redirect.
	base := strings.TrimRight(p.backendUrl, "/")
	endpoint := fmt.Sprintf("%s/api/jobs/used_nodes?ts=%d", base, ts)

	resp, err := p.client.Get(endpoint)
	if err != nil {
		return nil, fmt.Errorf("querying used nodes from backend: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Drain (a bounded part of) the body so the transport can reuse the
		// keep-alive connection. A drain failure is deliberately ignored: the
		// status error below is what the caller needs to see.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxErrorBodyDrain))
		return nil, fmt.Errorf("backend returned status %d", resp.StatusCode)
	}

	var result map[string][]string
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decoding used nodes response: %w", err)
	}
	return result, nil
}

View File

@@ -10,57 +10,20 @@ import (
"encoding/json"
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// AggregationStrategy selects how multiple values at different
// cpus/sockets/... are aggregated. It does not describe aggregation over time.
type AggregationStrategy int
const (
// NoAggregation is the zero value; UnmarshalJSON selects it for the empty string.
NoAggregation AggregationStrategy = iota
// SumAggregation is selected by the JSON string "sum".
SumAggregation
// AvgAggregation is selected by the JSON string "avg".
AvgAggregation
)
// UnmarshalJSON decodes a JSON string naming an aggregation strategy into as.
// The empty string selects NoAggregation, "sum" selects SumAggregation and
// "avg" selects AvgAggregation. Any other string yields an error and leaves
// *as untouched, as does input that cannot be decoded into a Go string.
func (as *AggregationStrategy) UnmarshalJSON(data []byte) error {
	var name string
	if err := json.Unmarshal(data, &name); err != nil {
		return err
	}

	parsed := NoAggregation
	switch {
	case name == "sum":
		parsed = SumAggregation
	case name == "avg":
		parsed = AvgAggregation
	case name != "":
		return fmt.Errorf("invalid aggregation strategy: %#v", name)
	}

	*as = parsed
	return nil
}
// MetricConfig is the per-metric configuration: the measurement interval and
// the strategy used to aggregate values over the hierarchy.
type MetricConfig struct {
// Interval in seconds at which measurements will arrive.
Frequency int64 `json:"frequency"`
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy `json:"aggregation"`
// Private, used internally; it has no JSON tag.
Offset int
}
var metrics map[string]MetricConfig
var metrics map[string]metricstore.MetricConfig
type Config struct {
Address string `json:"addr"`
CertFile string `json:"https-cert-file"`
KeyFile string `json:"https-key-file"`
User string `json:"user"`
Group string `json:"group"`
Debug struct {
Address string `json:"addr"`
CertFile string `json:"https-cert-file"`
KeyFile string `json:"https-key-file"`
User string `json:"user"`
Group string `json:"group"`
BackendURL string `json:"backend-url"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
@@ -69,13 +32,32 @@ type Config struct {
var Keys Config
// metricConfigJSON is the JSON shape of a single metric's configuration as
// decoded by InitMetrics, before the aggregation name is converted into a
// metricstore aggregation strategy.
type metricConfigJSON struct {
// Frequency is copied unchanged into metricstore.MetricConfig.Frequency.
Frequency int64 `json:"frequency"`
// Aggregation is the strategy name handed to metricstore.AssignAggregationStrategy.
Aggregation string `json:"aggregation"`
}
// InitMetrics loads the per-metric configuration from metricConfig into the
// package-level metrics map. The raw JSON is first passed to Validate together
// with metricConfigSchema and then decoded with unknown fields disallowed; a
// decode error is reported through cclog.Abortf. Each entry's aggregation
// string is converted with metricstore.AssignAggregationStrategy. A conversion
// error is only logged as a warning, and the metric is still stored with
// whatever value that call returned.
func InitMetrics(metricConfig json.RawMessage) {
Validate(metricConfigSchema, metricConfig)
var tempMetrics map[string]metricConfigJSON
dec := json.NewDecoder(bytes.NewReader(metricConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&metrics); err != nil {
if err := dec.Decode(&tempMetrics); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", metricConfig, err.Error())
}
metrics = make(map[string]metricstore.MetricConfig)
for name, cfg := range tempMetrics {
agg, err := metricstore.AssignAggregationStrategy(cfg.Aggregation)
if err != nil {
cclog.Warnf("Could not parse aggregation strategy for metric '%s': %s", name, err.Error())
}
metrics[name] = metricstore.MetricConfig{
Frequency: cfg.Frequency,
Aggregation: agg,
}
}
}
func Init(mainConfig json.RawMessage) {
@@ -93,3 +75,7 @@ func GetMetricFrequency(metricName string) (int64, error) {
}
return 0, fmt.Errorf("metric %s not found", metricName)
}
// GetMetrics returns the package-level metrics map, keyed by metric name.
// The map itself is returned, not a copy, so callers share it with this
// package. It is nil until it has been assigned, which InitMetrics does.
func GetMetrics() map[string]metricstore.MetricConfig {
return metrics
}

View File

@@ -29,6 +29,10 @@ var configSchema = `
"description": "Drop root permissions once the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"backend-url": {
"description": "URL of cc-backend for querying job information (e.g., 'https://localhost:8080').",
"type": "string"
},
"debug": {
"description": "Debug options.",
"type": "object",