98 Commits

Author SHA1 Message Date
Christoph Kluge
55d2c7d7eb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-16 09:32:05 +01:00
Christoph Kluge
bb527fb410 bump frontend dependencies 2026-01-16 09:32:02 +01:00
9a97d0e8eb Add documentation 2026-01-16 08:27:46 +01:00
93dcfee8c5 Document and refactor 2026-01-16 08:27:15 +01:00
76139ef53c Remove now optional apiAllowedIPs option 2026-01-16 08:23:31 +01:00
Aditya Ujeniya
32319adf72 Add Memory Tracker worker for CCMS 2026-01-15 21:29:21 +01:00
Aditya Ujeniya
10a5c89a16 Fix logic for findFiles() and keep archive worker 2026-01-15 20:27:11 +01:00
Christoph Kluge
40bff1eff9 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 18:18:54 +01:00
Christoph Kluge
ceba4eb0c6 Review dashboards, add two-sided progress to internal dash 2026-01-15 18:18:51 +01:00
Christoph Kluge
faacf3f343 svelte state_referenced_locally warning fixes
- change to derived where possible
- suppress warning elsewhere
- discussion here: sveltejs/svelte/issues/17289
2026-01-15 18:17:45 +01:00
Aditya Ujeniya
7cd98c4f25 Test and update files for dynamic retention 2026-01-15 17:48:59 +01:00
Michael Panzlaff
489ad44b9f Make apiAllowedIPs optional
If our test and production instances just use *, one might as well make
that the default value. This should ease configuration for minimal
setups.
2026-01-15 16:08:29 +01:00
7db2bbe6b0 Add job tagger option to example config 2026-01-15 15:53:54 +01:00
b6f0faa97f Make polarplot default in Jobview 2026-01-15 15:47:40 +01:00
a3fffa8e8b Update example and demo config 2026-01-15 13:57:15 +01:00
72248defbf Cleanup print statements. Always enable Compression 2026-01-15 13:39:22 +01:00
155e05495e Fix shutdown timeout bug 2026-01-15 13:29:19 +01:00
9c92a7796b Introduce nodeprovider interface to break import cycle 2026-01-15 12:20:11 +01:00
7c78407c49 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-15 11:34:09 +01:00
cb219b3c74 Fix configuration issues. Fix shutdown hangs
Always turn on compression
2026-01-15 11:34:06 +01:00
d59aa2e855 Restructure configuration with sensible defaults. Fix shutdown hangs 2026-01-15 11:33:01 +01:00
Christoph Kluge
cd3d133f0d Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 10:12:02 +01:00
Christoph Kluge
3b7fc44ce9 specify job count return logs, move to debug level 2026-01-15 10:11:31 +01:00
e1efc68476 Update dependencies. Rebuild graphql and swagger 2026-01-15 08:32:06 +01:00
8f0bb907ff Improve documentation and add more tests 2026-01-15 06:41:23 +01:00
Christoph Kluge
e5c620ca20 filter metrics for NodeMetrics query 2026-01-14 19:22:31 +01:00
Christoph Kluge
d0bcfb90e6 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 17:18:25 +01:00
Christoph Kluge
9deee54e41 make nextPage query conditional if no return limit is requested 2026-01-14 17:18:21 +01:00
Michael Panzlaff
94b86ef11a Mismatched event types are not something to be concerned about
If a different CCMessage type was sent over the same subject as
requested, that shouldn't raise a warning. This may happen in production
instances, but in order to ease debugging, lower it to 'debug' level.
2026-01-14 16:08:33 +01:00
Christoph Kluge
d8cd752dcb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 16:03:39 +01:00
Christoph Kluge
5d376e6865 fix clustername array init size 2026-01-14 16:03:34 +01:00
9c3beddf54 Improve documentation 2026-01-14 15:39:38 +01:00
c6465ad9e5 Add s3 configuration options 2026-01-14 15:28:34 +01:00
d415381d4a Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 15:10:25 +01:00
211d4fae54 Refactor s3 backend and suppress checksum warning 2026-01-14 15:10:20 +01:00
Aditya Ujeniya
3276ed7785 Half-baked commit for new dynamic retention logic 2026-01-14 14:56:36 +01:00
Christoph Kluge
77b7548ef3 fix wrong varname 2026-01-14 13:00:36 +01:00
Christoph Kluge
59851f410e Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 11:25:43 +01:00
Christoph Kluge
4cb8d648cb adapt frontend to backend config changes, clarify variable names 2026-01-14 11:25:40 +01:00
c8627a13f4 Remove obsolete clusters config section 2026-01-14 11:17:49 +01:00
0ea0270fe1 Reintroduce Clusters as string list of cluster names 2026-01-14 10:37:07 +01:00
19402d30af Review and improve error messages and doc comments 2026-01-14 10:09:19 +01:00
b2f870e3c0 Convert nodestate nats API to influx line protocol payload. Review and add doc comments.
Improve and extend tests
2026-01-14 10:08:06 +01:00
9e542dc200 Review and improve, add documentation 2026-01-14 09:26:03 +01:00
6cf59043a3 Review and improve, add documentation 2026-01-14 08:59:27 +01:00
71b75eea0e Improve GetUsedNodes function 2026-01-14 08:49:55 +01:00
e900a686db Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 07:38:04 +01:00
fb8db3c3ae Add query which node metric data needs to be retained 2026-01-14 07:37:31 +01:00
Christoph Kluge
170a9ace8a Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-13 16:59:57 +01:00
Christoph Kluge
518e9950ea add job exclusivity filter, review db indices 2026-01-13 16:59:52 +01:00
25c8fca561 Revert retention config in metricstore 2026-01-13 14:42:24 +01:00
754f7e16f6 Reformat with gofumpt 2026-01-13 09:52:31 +01:00
04a2e460ae Refactor metricstore. Initial stub for cluster/ subcluster specific retention times 2026-01-13 09:52:00 +01:00
2ebab1e2e2 Reformat with gofumpt 2026-01-13 09:50:57 +01:00
a9366d14c6 Add README for tagging. Enable tagging by flag without configuration option 2026-01-13 08:32:32 +01:00
42809e3f75 Remove embedded tagger rules 2026-01-13 07:20:26 +01:00
4cec933349 Remove obsolete cluster config section 2026-01-13 06:28:33 +01:00
d3f3c532b1 Merge branch 'master' into dev 2026-01-12 11:18:56 +01:00
ad1e87d0b8 Disable dependabot alerts 2026-01-12 11:17:44 +01:00
Jan Eitzinger
affa85c086 Merge pull request #469 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/credentials-1.19.7
Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
2026-01-12 10:30:35 +01:00
Jan Eitzinger
aa053d78f7 Merge pull request #470 from ClusterCockpit/dependabot/go_modules/github.com/mattn/go-sqlite3-1.14.33
Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
2026-01-12 10:30:16 +01:00
dependabot[bot]
fae6d9d835 Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
Bumps [github.com/mattn/go-sqlite3](https://github.com/mattn/go-sqlite3) from 1.14.32 to 1.14.33.
- [Release notes](https://github.com/mattn/go-sqlite3/releases)
- [Commits](https://github.com/mattn/go-sqlite3/compare/v1.14.32...v1.14.33)

---
updated-dependencies:
- dependency-name: github.com/mattn/go-sqlite3
  dependency-version: 1.14.33
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:44 +00:00
dependabot[bot]
78f1db7ad1 Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
Bumps [github.com/aws/aws-sdk-go-v2/credentials](https://github.com/aws/aws-sdk-go-v2) from 1.19.6 to 1.19.7.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/m2/v1.19.6...service/m2/v1.19.7)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/credentials
  dependency-version: 1.19.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:40 +00:00
f1367f84f8 Merge branch 'master' into dev 2026-01-12 09:14:45 +01:00
Jan Eitzinger
4c81696f4d Merge pull request #455 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/rollup-4.54.0
Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
2026-01-12 09:09:42 +01:00
Jan Eitzinger
a91f8f72e3 Merge pull request #459 from ClusterCockpit/dependabot/go_modules/golang.org/x/oauth2-0.34.0
Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
2026-01-12 09:09:18 +01:00
Jan Eitzinger
87f7ed329c Merge pull request #461 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/svelte-5.46.1
Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
2026-01-12 09:08:49 +01:00
dependabot[bot]
8641d9053d Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.32.0 to 0.34.0.
- [Commits](https://github.com/golang/oauth2/compare/v0.32.0...v0.34.0)

---
updated-dependencies:
- dependency-name: golang.org/x/oauth2
  dependency-version: 0.34.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:07:20 +00:00
Jan Eitzinger
4a5ab8a279 Merge pull request #462 from ClusterCockpit/dependabot/go_modules/github.com/99designs/gqlgen-0.17.85
Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
2026-01-12 09:06:18 +01:00
Jan Eitzinger
d179412ab6 Merge pull request #463 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/service/s3-1.95.0
Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
2026-01-12 09:05:50 +01:00
Jan Eitzinger
968c7d179d Merge pull request #464 from ClusterCockpit/dependabot/go_modules/github.com/go-co-op/gocron/v2-2.19.0
Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
2026-01-12 09:05:29 +01:00
56399523d7 Update module deps 2026-01-12 09:00:06 +01:00
4d6326b8be Remove metricsync 2026-01-12 08:55:31 +01:00
dependabot[bot]
a2414791bf Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
Bumps [github.com/go-co-op/gocron/v2](https://github.com/go-co-op/gocron) from 2.18.2 to 2.19.0.
- [Release notes](https://github.com/go-co-op/gocron/releases)
- [Commits](https://github.com/go-co-op/gocron/compare/v2.18.2...v2.19.0)

---
updated-dependencies:
- dependency-name: github.com/go-co-op/gocron/v2
  dependency-version: 2.19.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:05:04 +00:00
dependabot[bot]
faf3a19f0c Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.90.2 to 1.95.0.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.90.2...service/s3/v1.95.0)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/service/s3
  dependency-version: 1.95.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:04:58 +00:00
dependabot[bot]
4e6038d6c1 Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
Bumps [github.com/99designs/gqlgen](https://github.com/99designs/gqlgen) from 0.17.84 to 0.17.85.
- [Release notes](https://github.com/99designs/gqlgen/releases)
- [Changelog](https://github.com/99designs/gqlgen/blob/master/CHANGELOG.md)
- [Commits](https://github.com/99designs/gqlgen/compare/v0.17.84...v0.17.85)

---
updated-dependencies:
- dependency-name: github.com/99designs/gqlgen
  dependency-version: 0.17.85
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:03:41 +00:00
dependabot[bot]
ddc2ecf829 Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
Bumps [svelte](https://github.com/sveltejs/svelte/tree/HEAD/packages/svelte) from 5.44.0 to 5.46.1.
- [Release notes](https://github.com/sveltejs/svelte/releases)
- [Changelog](https://github.com/sveltejs/svelte/blob/main/packages/svelte/CHANGELOG.md)
- [Commits](https://github.com/sveltejs/svelte/commits/svelte@5.46.1/packages/svelte)

---
updated-dependencies:
- dependency-name: svelte
  dependency-version: 5.46.1
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:02:46 +00:00
ecb5aef735 Fix build error in unit test 2025-12-25 08:48:03 +01:00
11ec2267da Major refactor of metric data handling
- make the internal memory store required and default
- Rename memorystore to metricstore
- Rename metricDataDispatcher to metricdispatch
- Remove metricdata package
- Introduce metricsync package for upstream metric data pull
2025-12-25 08:42:54 +01:00
8576ae458d Switch to cc-lib v2 2025-12-24 09:24:18 +01:00
Jan Eitzinger
c66445acb5 Merge pull request #458 from ClusterCockpit/dependabot/go_modules/github.com/expr-lang/expr-1.17.7
Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
2025-12-23 21:16:13 +01:00
dependabot[bot]
29a20f7b0b Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
Bumps [github.com/expr-lang/expr](https://github.com/expr-lang/expr) from 1.17.6 to 1.17.7.
- [Release notes](https://github.com/expr-lang/expr/releases)
- [Commits](https://github.com/expr-lang/expr/compare/v1.17.6...v1.17.7)

---
updated-dependencies:
- dependency-name: github.com/expr-lang/expr
  dependency-version: 1.17.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-23 09:07:01 +00:00
Jan Eitzinger
874c019fb6 Merge pull request #457 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/config-1.32.6
Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
2025-12-23 10:06:17 +01:00
Jan Eitzinger
54825626de Merge pull request #456 from ClusterCockpit/dependabot/go_modules/github.com/coreos/go-oidc/v3-3.17.0
Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
2025-12-23 10:05:56 +01:00
9bf5c5dc1a Update README and config schema 2025-12-23 09:34:09 +01:00
64fef9774c Add unit test for NATS API 2025-12-23 09:22:57 +01:00
999667ec0c Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 07:56:16 +01:00
c1135531ba Port NATS api to ccMessages 2025-12-23 07:56:13 +01:00
287256e5f1 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 05:56:49 +01:00
0bc26aa194 Add error check 2025-12-23 05:56:46 +01:00
Christoph Kluge
502d7e9084 Rework info panel in public dashboard
- change to bootstrap grid from table
- add infos, use badges
- remove non required query
2025-12-22 17:26:56 +01:00
Christoph Kluge
89875db4a9 dashboard layout fixes 2025-12-22 10:39:40 +01:00
dependabot[bot]
5a8b929448 Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.31.20 to 1.32.6.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.31.20...v1.32.6)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/config
  dependency-version: 1.32.6
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:04:43 +00:00
dependabot[bot]
fe78f2f433 Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
Bumps [github.com/coreos/go-oidc/v3](https://github.com/coreos/go-oidc) from 3.16.0 to 3.17.0.
- [Release notes](https://github.com/coreos/go-oidc/releases)
- [Commits](https://github.com/coreos/go-oidc/compare/v3.16.0...v3.17.0)

---
updated-dependencies:
- dependency-name: github.com/coreos/go-oidc/v3
  dependency-version: 3.17.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:03:31 +00:00
dependabot[bot]
e37591ce6d Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
Bumps [rollup](https://github.com/rollup/rollup) from 4.53.3 to 4.54.0.
- [Release notes](https://github.com/rollup/rollup/releases)
- [Changelog](https://github.com/rollup/rollup/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rollup/rollup/compare/v4.53.3...v4.54.0)

---
updated-dependencies:
- dependency-name: rollup
  dependency-version: 4.54.0
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:02:41 +00:00
Jan Eitzinger
998f800632 Merge pull request #454 from ClusterCockpit/dev
Dev
2025-12-18 15:52:06 +01:00
Jan Eitzinger
10a0b0add8 Merge pull request #452 from ClusterCockpit/dev
Dev
2025-12-18 11:28:07 +01:00
Jan Eitzinger
f9aa47ea1c Merge pull request #450 from ClusterCockpit/dev
Dev
2025-12-15 14:42:26 +01:00
213 changed files with 11807 additions and 5891 deletions

.github/dependabot.yml

View File

@@ -1,15 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
  - package-ecosystem: "gomod"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "npm"
    directory: "/web/frontend"
    schedule:
      interval: "weekly"

CLAUDE.md
View File

@@ -96,15 +96,19 @@ The backend follows a layered architecture with clear separation of concerns:
- **internal/auth**: Authentication layer
  - Supports local accounts, LDAP, OIDC, and JWT tokens
  - Implements rate limiting for login attempts
- **internal/metricdata**: Metric data repository abstraction
  - Pluggable backends: cc-metric-store, Prometheus, InfluxDB
  - Each cluster can have a different metric data backend
- **internal/metricstore**: Metric store with data loading API
  - In-memory metric storage with checkpointing
  - Query API for loading job metric data
- **internal/archiver**: Job archiving to file-based archive
- **internal/api/nats.go**: NATS-based API for job and node operations
  - Subscribes to NATS subjects for job events (start/stop)
  - Handles node state updates via NATS
  - Uses InfluxDB line protocol message format
- **pkg/archive**: Job archive backend implementations
  - File system backend (default)
  - S3 backend
  - SQLite backend (experimental)
- **pkg/nats**: NATS integration for metric ingestion
- **pkg/nats**: NATS client and message decoding utilities
### Frontend Structure
@@ -146,6 +150,14 @@ applied automatically on startup. Version tracking in `version` table.
## Configuration
- **config.json**: Main configuration (clusters, metric repositories, archive settings)
  - `main.apiSubjects`: NATS subject configuration (optional)
    - `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event")
    - `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state")
  - `nats`: NATS client connection configuration (optional)
    - `address`: NATS server address (e.g., "nats://localhost:4222")
    - `username`: Authentication username (optional)
    - `password`: Authentication password (optional)
    - `creds-file-path`: Path to NATS credentials file (optional)
- **.env**: Environment variables (secrets like JWT keys)
  - Copy from `configs/env-template.txt`
  - NEVER commit this file
@@ -197,8 +209,8 @@ applied automatically on startup. Version tracking in `version` table.
### Adding a new metric data backend
1. Implement `MetricDataRepository` interface in `internal/metricdata/`
2. Register in `metricdata.Init()` switch statement
1. Implement metric loading functions in `internal/metricstore/query.go`
2. Add cluster configuration to metric store initialization
3. Update config.json schema documentation
### Modifying database schema
@@ -207,9 +219,87 @@ applied automatically on startup. Version tracking in `version` table.
2. Increment `repository.Version`
3. Test with fresh database and existing database
## NATS API
The backend supports a NATS-based API as an alternative to the REST API for job and node operations.
### Setup
1. Configure NATS client connection in `config.json`:
```json
{
  "nats": {
    "address": "nats://localhost:4222",
    "username": "user",
    "password": "pass"
  }
}
```
2. Configure API subjects in `config.json` under `main`:
```json
{
  "main": {
    "apiSubjects": {
      "subjectJobEvent": "cc.job.event",
      "subjectNodeState": "cc.node.state"
    }
  }
}
```
### Message Format
Messages use **InfluxDB line protocol** format with the following structure:
#### Job Events
**Start Job:**
```
job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
```
**Stop Job:**
```
job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
```
**Tags:**
- `function`: Either `start_job` or `stop_job`
**Fields:**
- `event`: JSON payload containing job data (see REST API documentation for schema)
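For orientation, here is a minimal Go sketch of a client publishing a start-job event in this format. The NATS address, subject, and payload fields are taken from the examples in this document and are assumptions, not fixed values:
```go
// Minimal publisher sketch: wrap a JSON job payload in the documented
// line-protocol envelope and publish it. Error handling is abbreviated.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"strings"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	nc, err := nats.Connect("nats://localhost:4222")
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	// JSON payload for the `event` field (schema as in the REST API docs).
	payload, err := json.Marshal(map[string]any{
		"jobId":     123,
		"user":      "alice",
		"cluster":   "test",
		"startTime": time.Now().Unix(),
	})
	if err != nil {
		log.Fatal(err)
	}

	// Line protocol: measurement,tag field timestamp. Double quotes inside
	// the string field value must be escaped.
	event := strings.ReplaceAll(string(payload), `"`, `\"`)
	line := fmt.Sprintf(`job,function=start_job event="%s" %d`,
		event, time.Now().UnixNano())

	if err := nc.Publish("cc.job.event", []byte(line)); err != nil {
		log.Fatal(err)
	}
	nc.Flush() // make sure the message is on the wire before exiting
}
```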
#### Node State Updates
```json
{
  "cluster": "testcluster",
  "nodes": [
    {
      "hostname": "node001",
      "states": ["allocated"],
      "cpusAllocated": 8,
      "memoryAllocated": 16384,
      "gpusAllocated": 0,
      "jobsRunning": 1
    }
  ]
}
```
### Implementation Notes
- NATS API mirrors REST API functionality but uses messaging
- Job start/stop events are processed asynchronously
- Duplicate job detection is handled (same as REST API)
- All validation rules from REST API apply
- Messages are logged; no responses are sent back to publishers
- If NATS client is unavailable, API subscriptions are skipped (logged as warning)
## Dependencies
- Go 1.24.0+ (check go.mod for exact version)
- Node.js (for frontend builds)
- SQLite 3 (only supported database)
- Optional: NATS server for metric ingestion
- Optional: NATS server for NATS API integration

README.md

View File

@@ -22,11 +22,12 @@ switching from PHP Symfony to a Golang based solution are explained
## Overview
This is a Golang web backend for the ClusterCockpit job-specific performance
monitoring framework. It provides a REST API for integrating ClusterCockpit with
an HPC cluster batch system and external analysis scripts. Data exchange between
the web front-end and the back-end is based on a GraphQL API. The web frontend
is also served by the backend using [Svelte](https://svelte.dev/) components.
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
monitoring framework. It provides a REST API and an optional NATS-based messaging
API for integrating ClusterCockpit with an HPC cluster batch system and external
analysis scripts. Data exchange between the web front-end and the back-end is
based on a GraphQL API. The web frontend is also served by the backend using
[Svelte](https://svelte.dev/) components. Layout and styling are based on
[Bootstrap 5](https://getbootstrap.com/) using
[Bootstrap Icons](https://icons.getbootstrap.com/).
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
@@ -35,6 +36,10 @@ databases, the only tested and supported setup is to use cc-metric-store as the
metric data backend. Documentation on how to integrate ClusterCockpit with other
time series databases will be added in the future.
For real-time integration with HPC systems, the backend can subscribe to
[NATS](https://nats.io/) subjects to receive job start/stop events and node
state updates, providing an alternative to REST API polling.
Completed batch jobs are stored in a file-based job archive according to
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
The backend supports authentication via local accounts, an external LDAP
@@ -130,27 +135,58 @@ ln -s <your-existing-job-archive> ./var/job-archive
## Project file structure
- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github)
GitHub Actions workflows and dependabot configuration for CI/CD.
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
contains the API schema files for the REST and GraphQL APIs. The REST API is
documented in the OpenAPI 3.0 format in
[./api/openapi.yaml](./api/openapi.yaml).
[./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in
[./api/schema.graphqls](./api/schema.graphqls).
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
contains `main.go` for the main application.
contains the main application entry point and CLI implementation.
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
contains documentation about configuration and command line options and required
environment variables. A sample configuration file is provided.
- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
contains more in-depth documentation.
environment variables. Sample configuration files are provided.
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
contains an example of setting up systemd for production use.
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
  contains library source code that is not intended for use by others.
  - [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api)
    REST API handlers and NATS integration
  - [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver)
    Job archiving functionality
  - [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth)
    Authentication (local, LDAP, OIDC) and JWT token handling
  - [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config)
    Configuration management and validation
  - [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph)
    GraphQL schema and resolvers
  - [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer)
    Job data import and database initialization
  - [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstore)
    In-memory metric data store with checkpointing and metric loading
  - [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch)
    Dispatches metric data loading to appropriate backends
  - [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository)
    Database repository layer for jobs and metadata
  - [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig)
    HTTP router configuration and middleware
  - [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger)
    Job classification and application detection
  - [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
    Background task management and scheduled jobs
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
  contains Go packages that can be used by other projects.
  - [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
    Job archive backend implementations (filesystem, S3)
  - [`nats`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/nats)
    NATS client and message handling
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
  Additional command line helper tools.
  - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
    Commands for getting infos about and existing job archive.
    Commands for getting infos about an existing job archive.
  - [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
    Tool for migrating job archives between formats.
  - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
    Tool to convert external pubkey for use in `cc-backend`.
  - [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
@@ -162,7 +198,7 @@ ln -s <your-existing-job-archive> ./var/job-archive
  - [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
    Svelte components and static assets for the frontend UI
  - [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
    Server-side Go templates
    Server-side Go templates, including monitoring views
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
  Configures the behaviour and generation of
  [gqlgen](https://github.com/99designs/gqlgen).

api/schema.graphqls

View File

@@ -458,6 +458,7 @@ input JobFilter {
  state: [JobState!]
  metricStats: [MetricStatItem!]
  shared: String
  schedule: String
  node: StringInput
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -15,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/util"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
const envString = `
@@ -36,7 +36,7 @@ const configString = `
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"trigger": 300,
"resolutions": [
240,
60
@@ -48,7 +48,7 @@ const configString = `
"emission-constant": 317
},
"cron": {
"commit-job-worker": "2m",
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
@@ -60,31 +60,7 @@ const configString = `
"jwts": {
"max-age": "2000h"
}
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
}
}
}
]
}
}
`

cmd/cc-backend/main.go

View File

@@ -24,19 +24,18 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
"github.com/ClusterCockpit/cc-backend/internal/taskmanager"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
"github.com/ClusterCockpit/cc-backend/web"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/google/gops/agent"
"github.com/joho/godotenv"
@@ -103,12 +102,7 @@ func initConfiguration() error {
return fmt.Errorf("main configuration must be present")
}
clustercfg := ccconf.GetPackageConfig("clusters")
if clustercfg == nil {
return fmt.Errorf("cluster configuration must be present")
}
config.Init(cfg, clustercfg)
config.Init(cfg)
return nil
}
@@ -277,21 +271,14 @@ func initSubsystems() error {
// Initialize job archive
archiveCfg := ccconf.GetPackageConfig("archive")
if archiveCfg == nil {
cclog.Debug("Archive configuration not found, using default archive configuration")
archiveCfg = json.RawMessage(defaultArchiveConfig)
}
if err := archive.Init(archiveCfg, config.Keys.DisableArchive); err != nil {
return fmt.Errorf("initializing archive: %w", err)
}
// Initialize metricdata
// if err := metricdata.Init(); err != nil {
// return fmt.Errorf("initializing metricdata repository: %w", err)
// }
// Initialize upstream metricdata repositories for pull worker
if err := metricdata.InitUpstreamRepos(); err != nil {
return fmt.Errorf("initializing upstream metricdata repositories: %w", err)
}
// Note: metricstore.Init() is called later in runServer() with proper configuration
// Handle database re-initialization
if flagReinitDB {
@@ -316,6 +303,8 @@ func initSubsystems() error {
// Apply tags if requested
if flagApplyTags {
tagger.Init()
if err := tagger.RunTaggers(); err != nil {
return fmt.Errorf("running job taggers: %w", err)
}
@@ -330,9 +319,14 @@ func runServer(ctx context.Context) error {
// Initialize metric store if configuration is provided
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
memorystore.Init(mscfg, &wg)
metricstore.Init(mscfg, &wg)
// Inject repository as NodeProvider to break import cycle
ms := metricstore.GetMemoryStore()
jobRepo := repository.GetJobRepository()
ms.SetNodeProvider(jobRepo)
} else {
cclog.Debug("Metric store configuration not found, skipping memorystore initialization")
return fmt.Errorf("missing metricstore configuration")
}
// Start archiver and task manager
@@ -375,7 +369,7 @@ func runServer(ctx context.Context) error {
case <-ctx.Done():
}
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
runtime.SystemdNotify(false, "Shutting down ...")
srv.Shutdown(ctx)
util.FsWatcherShutdown()
taskmanager.Shutdown()
@@ -385,24 +379,39 @@ func runServer(ctx context.Context) error {
if os.Getenv(envGOGC) == "" {
debug.SetGCPercent(25)
}
runtimeEnv.SystemdNotifiy(true, "running")
runtime.SystemdNotify(true, "running")
// Wait for completion or error
waitDone := make(chan struct{})
go func() {
wg.Wait()
close(waitDone)
}()
go func() {
<-waitDone
close(errChan)
}()
// Check for server startup errors
// Wait for either:
// 1. An error from server startup
// 2. Completion of all goroutines (normal shutdown or crash)
select {
case err := <-errChan:
// errChan will be closed when waitDone is closed, which happens
// when all goroutines complete (either from normal shutdown or error)
if err != nil {
return err
}
case <-time.After(100 * time.Millisecond):
// Server started successfully, wait for completion
if err := <-errChan; err != nil {
return err
// Give the server 100ms to start and report any immediate startup errors
// After that, just wait for normal shutdown completion
select {
case err := <-errChan:
if err != nil {
return err
}
case <-waitDone:
// Normal shutdown completed
}
}

View File

@@ -29,12 +29,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
"github.com/ClusterCockpit/cc-backend/web"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
httpSwagger "github.com/swaggo/http-swagger"
@@ -345,7 +345,7 @@ func (s *Server) Start(ctx context.Context) error {
// Because this program will want to bind to a privileged port (like 80), the listener must
// be established first, then the user can be changed, and after that,
// the actual http server can be started.
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
return fmt.Errorf("dropping privileges: %w", err)
}
@@ -381,7 +381,7 @@ func (s *Server) Shutdown(ctx context.Context) {
}
// Archive all the metric store data
memorystore.Shutdown()
metricstore.Shutdown()
// Shutdown archiver with 10 second timeout for fast shutdown
if err := archiver.Shutdown(10 * time.Second); err != nil {

View File

@@ -1,96 +1,23 @@
{
"main": {
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
},
"apiAllowedIPs": [
"*"
],
"emission-constant": 317
"addr": "127.0.0.1:8080"
},
"cron": {
"commit-job-worker": "2m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
"commit-job-worker": "1m",
"duration-worker": "3m",
"footprint-worker": "5m"
},
"auth": {
"jwts": {
"max-age": "2000h"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"clusters": [
{
"name": "fritz",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
],
"metric-store": {
"checkpoints": {
"file-format": "avro",
"interval": "1h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "1h",
"directory": "./var/archive"
"interval": "12h"
},
"retention-in-memory": "48h",
"subscriptions": [
{
"subscribe-to": "hpc-nats",
"cluster-tag": "fritz"
},
{
"subscribe-to": "hpc-nats",
"cluster-tag": "alex"
}
]
"memory-cap": 100
}
}

View File

@@ -5,45 +5,62 @@
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"validate": false,
"apiAllowedIPs": ["*"],
"short-running-jobs-duration": 300,
"enable-job-taggers": true,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
"resolutions": [240, 60]
},
"apiSubjects": {
"subjectJobEvent": "cc.job.event",
"subjectNodeState": "cc.node.state"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"auth": {
"jwts": {
"max-age": "2000h"
}
},
"cron": {
"commit-job-worker": "2m",
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "test",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"kind": "s3",
"endpoint": "http://x.x.x.x",
"bucket": "jobarchive",
"accessKey": "xx",
"secretKey": "xx",
"retention": {
"policy": "move",
"age": 365,
"location": "./var/archive"
}
]
},
"metric-store": {
"checkpoints": {
"interval": "12h"
},
"memory-cap": 100,
"retention-in-memory": "48h",
"nats-subscriptions": [
{
"subscribe-to": "hpc-nats",
"cluster-tag": "fritz"
},
{
"subscribe-to": "hpc-nats",
"cluster-tag": "alex"
}
]
},
"ui-file": "ui-config.json"
}

configs/tagger/README.md (new file)
View File

@@ -0,0 +1,419 @@
# Job Tagging Configuration
ClusterCockpit provides automatic job tagging functionality to classify and
categorize jobs based on configurable rules. The tagging system consists of two
main components:
1. **Application Detection** - Identifies which application a job is running
2. **Job Classification** - Analyzes job performance characteristics and applies classification tags
## Directory Structure
```
configs/tagger/
├── apps/                # Application detection patterns
│   ├── vasp.txt
│   ├── gromacs.txt
│   └── ...
└── jobclasses/          # Job classification rules
    ├── parameters.json
    ├── lowUtilization.json
    ├── highload.json
    └── ...
```
## Activating Tagger Rules
### Step 1: Copy Configuration Files
To activate tagging, review, adapt, and copy the configuration files from
`configs/tagger/` to `var/tagger/`:
```bash
# From the cc-backend root directory
mkdir -p var/tagger
cp -r configs/tagger/apps var/tagger/
cp -r configs/tagger/jobclasses var/tagger/
```
### Step 2: Enable Tagging in Configuration
Add or set the following configuration key in the `main` section of your `config.json`:
```json
{
  "enable-job-taggers": true
}
```
**Important**: Automatic tagging is disabled by default. You must explicitly
enable it by setting `enable-job-taggers: true` in the main configuration file.
### Step 3: Restart cc-backend
The tagger system automatically loads configuration from `./var/tagger/` at
startup. After copying the files and enabling the feature, restart cc-backend:
```bash
./cc-backend -server
```
### Step 4: Verify Configuration Loaded
Check the logs for messages indicating successful configuration loading:
```
[INFO] Setup file watch for ./var/tagger/apps
[INFO] Setup file watch for ./var/tagger/jobclasses
```
## How Tagging Works
### Automatic Tagging
When `enable-job-taggers` is set to `true` in the configuration, tags are
automatically applied when:
- **Job Start**: Application detection runs immediately when a job starts
- **Job Stop**: Job classification runs when a job completes
The system analyzes job metadata and metrics to determine appropriate tags.
**Note**: Automatic tagging only works for jobs that start or stop after the
feature is enabled. Existing jobs are not automatically retagged.
### Manual Tagging (Retroactive)
To apply tags to existing jobs in the database, use the `-apply-tags` command
line option:
```bash
./cc-backend -apply-tags
```
This processes all jobs in the database and applies current tagging rules. This
is useful when:
- You have existing jobs that were created before tagging was enabled
- You've added new tagging rules and want to apply them to historical data
- You've modified existing rules and want to re-evaluate all jobs
### Hot Reload
The tagger system watches the configuration directories for changes. You can
modify or add rules without restarting `cc-backend`:
- Changes to `var/tagger/apps/*` are detected automatically
- Changes to `var/tagger/jobclasses/*` are detected automatically
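Purely as an illustration of the mechanism (cc-backend uses its own watcher utilities, so this is not its actual code), such a hot-reload loop could be sketched with the fsnotify library; only the two directory paths below come from this document:
```go
// Conceptual hot-reload sketch: watch the tagger directories and react
// to writes and newly created files. Illustrative only.
package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func main() {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}
	defer watcher.Close()

	for _, dir := range []string{"./var/tagger/apps", "./var/tagger/jobclasses"} {
		if err := watcher.Add(dir); err != nil {
			log.Fatal(err)
		}
	}

	for {
		select {
		case ev, ok := <-watcher.Events:
			if !ok {
				return
			}
			if ev.Op&(fsnotify.Write|fsnotify.Create) != 0 {
				log.Printf("reloading rules after change to %s", ev.Name)
				// re-read and recompile the affected rule file here
			}
		case err, ok := <-watcher.Errors:
			if !ok {
				return
			}
			log.Println("watch error:", err)
		}
	}
}
```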
## Application Detection
Application detection identifies which software a job is running by matching
patterns in the job script.
### Configuration Format
Application patterns are stored in text files under `var/tagger/apps/`. Each
file contains one or more regular expression patterns (one per line) that match
against the job script.
**Example: `apps/vasp.txt`**
```
vasp
VASP
```
### How It Works
1. When a job starts, the system retrieves the job script from metadata
2. Each line in the app files is treated as a regex pattern
3. Patterns are matched case-insensitively against the lowercased job script
4. If a match is found, a tag of type `app` with the filename (without extension) is applied
5. Only the first matching application is tagged
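A compact Go sketch of this matching flow; `appRule`, `detectApp`, and their inputs are hypothetical names for illustration, not the tagger's actual API:
```go
// Illustrative sketch of steps 1-5 above: each line of an app file is a
// regex, matched case-insensitively against the lowercased job script;
// the first rule with a match provides the tag.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// appRule pairs a tag name (the app file name without extension) with
// the patterns read from that file.
type appRule struct {
	tag      string
	patterns []string
}

// detectApp returns the tag of the first rule whose patterns match.
func detectApp(rules []appRule, script string) (string, bool) {
	lower := strings.ToLower(script)
	for _, r := range rules {
		for _, p := range r.patterns {
			re, err := regexp.Compile("(?i)" + p) // case-insensitive match
			if err != nil {
				continue // skip invalid patterns
			}
			if re.MatchString(lower) {
				return r.tag, true
			}
		}
	}
	return "", false
}

func main() {
	rules := []appRule{{tag: "vasp", patterns: []string{"vasp"}}}
	if tag, ok := detectApp(rules, "#!/bin/bash\nsrun vasp_std"); ok {
		fmt.Println("app tag:", tag) // prints: app tag: vasp
	}
}
```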
### Adding New Applications
1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`)
2. Add regex patterns, one per line:
```
tensorflow
tf\.keras
import tensorflow
```
3. The file is automatically detected and loaded
**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`).
## Job Classification
Job classification analyzes completed jobs based on their metrics and properties
to identify performance issues or characteristics.
### Configuration Format
Job classification rules are defined in JSON files under
`var/tagger/jobclasses/`. Each rule file defines:
- **Metrics required**: Which job metrics to analyze
- **Requirements**: Pre-conditions that must be met
- **Variables**: Computed values used in the rule
- **Rule expression**: Boolean expression that determines if the rule matches
- **Hint template**: Message displayed when the rule matches
### Parameters File
`jobclasses/parameters.json` defines shared threshold values used across multiple rules:
```json
{
  "lowcpuload_threshold_factor": 0.9,
  "highmemoryusage_threshold_factor": 0.9,
  "job_min_duration_seconds": 600.0,
  "sampling_interval_seconds": 30.0
}
```
### Rule File Structure
**Example: `jobclasses/lowUtilization.json`**
```json
{
  "name": "Low resource utilization",
  "tag": "lowutilization",
  "parameters": ["job_min_duration_seconds"],
  "metrics": ["flops_any", "mem_bw"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "mem_bw_perc",
      "expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
    }
  ],
  "rule": "flops_any.avg < flops_any.limits.alert",
  "hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
}
```
#### Field Descriptions
| Field | Description |
| -------------- | ----------------------------------------------------------------------------- |
| `name` | Human-readable description of the rule |
| `tag` | Tag identifier applied when the rule matches |
| `parameters` | List of parameter names from `parameters.json` to include in rule environment |
| `metrics` | List of metrics required for evaluation (must be present in job data) |
| `requirements` | Boolean expressions that must all be true for the rule to be evaluated |
| `variables` | Named expressions computed before evaluating the main rule |
| `rule` | Boolean expression that determines if the job matches this classification |
| `hint` | Go template string for generating a user-visible message |
### Expression Environment
Expressions in `requirements`, `variables`, and `rule` have access to:
**Job Properties:**
- `job.shared` - Shared node allocation type
- `job.duration` - Job runtime in seconds
- `job.numCores` - Number of CPU cores
- `job.numNodes` - Number of nodes
- `job.jobState` - Job completion state
- `job.numAcc` - Number of accelerators
- `job.smt` - SMT setting
**Metric Statistics (for each metric in `metrics`):**
- `<metric>.min` - Minimum value
- `<metric>.max` - Maximum value
- `<metric>.avg` - Average value
- `<metric>.limits.peak` - Peak limit from cluster config
- `<metric>.limits.normal` - Normal threshold
- `<metric>.limits.caution` - Caution threshold
- `<metric>.limits.alert` - Alert threshold
**Parameters:**
- All parameters listed in the `parameters` field
**Variables:**
- All variables defined in the `variables` array
### Expression Language
Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations:
- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^`
- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Logical**: `&&`, `||`, `!`
- **Functions**: Standard math functions (see expr documentation)
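As a rough standalone illustration, the `lowUtilization` rule expression from above can be evaluated with the expr library against a hand-built environment; the real tagger assembles this environment from job data and cluster limits, so the values here are made up:
```go
// Evaluate a rule expression with expr-lang against a nested-map
// environment. Values are invented for the sketch.
package main

import (
	"fmt"
	"log"

	"github.com/expr-lang/expr"
)

func main() {
	env := map[string]any{
		"flops_any": map[string]any{
			"avg":    12.5,
			"limits": map[string]any{"alert": 50.0},
		},
		"job": map[string]any{"shared": "none", "duration": 7200},
	}

	program, err := expr.Compile("flops_any.avg < flops_any.limits.alert", expr.Env(env))
	if err != nil {
		log.Fatal(err)
	}
	matched, err := expr.Run(program, env)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("rule matched:", matched) // rule matched: true
}
```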
### Hint Templates
Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible:
```
{{.flops_any.avg}} # Access metric average
{{.job.duration}} # Access job property
{{.my_variable}} # Access computed variable
```
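And a minimal sketch of rendering such a hint with Go's `text/template`, assuming the same kind of nested-map environment used for rule evaluation; the numbers are invented:
```go
// Render a hint template against a nested-map environment.
package main

import (
	"log"
	"os"
	"text/template"
)

func main() {
	env := map[string]any{
		"flops_any": map[string]any{
			"avg":    12.5,
			"limits": map[string]any{"alert": 50.0},
		},
	}

	const hint = "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}\n"
	tmpl := template.Must(template.New("hint").Parse(hint))
	if err := tmpl.Execute(os.Stdout, env); err != nil {
		log.Fatal(err)
	}
	// Prints: Average flop rate 12.5 falls below threshold 50
}
```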
### Adding New Classification Rules
1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`)
2. Define the rule structure:
```json
{
  "name": "Memory Leak Detection",
  "tag": "memory_leak",
  "parameters": ["memory_leak_slope_threshold"],
  "metrics": ["mem_used"],
  "requirements": ["job.duration > 3600"],
  "variables": [
    {
      "name": "mem_growth",
      "expr": "(mem_used.max - mem_used.min) / job.duration"
    }
  ],
  "rule": "mem_growth > memory_leak_slope_threshold",
  "hint": "Memory usage grew by {{.mem_growth}} per second"
}
```
3. Add any new parameters to `parameters.json`
4. The file is automatically detected and loaded
## Configuration Paths
The tagger system reads from these paths (relative to cc-backend working directory):
- **Application patterns**: `./var/tagger/apps/`
- **Job classification rules**: `./var/tagger/jobclasses/`
These paths are defined as constants in the source code and cannot be changed without recompiling.
## Troubleshooting
### Tags Not Applied
1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json`
2. **Check configuration exists**:
```bash
ls -la var/tagger/apps
ls -la var/tagger/jobclasses
```
3. **Check logs for errors**:
```bash
./cc-backend -server -loglevel debug
```
4. **Verify file permissions**: Ensure cc-backend can read the configuration files
5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs
### Rules Not Matching
1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation
2. **Check requirements**: Ensure all requirements in the rule are satisfied
3. **Verify metrics exist**: Classification rules require job metrics to be available
4. **Check metric names**: Ensure metric names match those in your cluster configuration
### File Watch Not Working
If changes to configuration files aren't detected:
1. Restart cc-backend to reload all configuration
2. Check filesystem supports file watching (network filesystems may not)
3. Check logs for file watch setup messages
## Best Practices
1. **Start Simple**: Begin with basic rules and refine based on results
2. **Use Requirements**: Filter out irrelevant jobs early with requirements
3. **Test Incrementally**: Add one rule at a time and verify behavior
4. **Document Rules**: Use descriptive names and clear hint messages
5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency
6. **Version Control**: Keep your `var/tagger/` configuration in version control
7. **Backup Before Changes**: Test new rules on a copy before deploying to production
## Examples
### Simple Application Detection
**File: `var/tagger/apps/python.txt`**
```
python
python3
\.py
```
This detects jobs running Python scripts.
### Complex Classification Rule
**File: `var/tagger/jobclasses/cpuImbalance.json`**
```json
{
  "name": "CPU Load Imbalance",
  "tag": "cpu_imbalance",
  "parameters": ["core_load_imbalance_threshold_factor"],
  "metrics": ["cpu_load"],
  "requirements": ["job.numCores > 1", "job.duration > 600"],
  "variables": [
    {
      "name": "load_variance",
      "expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg"
    }
  ],
  "rule": "load_variance > core_load_imbalance_threshold_factor",
  "hint": "CPU load varies by {{printf \"%.1f%%\" (load_variance * 100)}} across cores"
}
```
This detects jobs where CPU load is unevenly distributed across cores.
## Reference
### Configuration Options
**Main Configuration (`config.json`)**:
- `enable-job-taggers` (boolean, default: `false`) - Enables automatic job tagging system
- Must be set to `true` to activate automatic tagging on job start/stop events
- Does not affect the `-apply-tags` command line option
**Command Line Options**:
- `-apply-tags` - Apply all tagging rules to existing jobs in the database
- Works independently of `enable-job-taggers` configuration
- Useful for retroactively tagging jobs or re-evaluating with updated rules
### Default Configuration Location
The example configurations are provided in:
- `configs/tagger/apps/` - Example application patterns (16 applications)
- `configs/tagger/jobclasses/` - Example classification rules (3 rules)
Copy these to `var/tagger/` and customize for your environment.
### Tag Types
- `app` - Application tags (e.g., "vasp", "gromacs")
- `jobClass` - Classification tags (e.g., "lowutilization", "highload")
Tags can be queried and filtered in the ClusterCockpit UI and API.

go.mod
View File

@@ -10,16 +10,16 @@ tool (
)
require (
github.com/99designs/gqlgen v0.17.84
github.com/ClusterCockpit/cc-lib v1.0.2
github.com/99designs/gqlgen v0.17.85
github.com/ClusterCockpit/cc-lib/v2 v2.1.0
github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.0
github.com/aws/aws-sdk-go-v2/config v1.31.20
github.com/aws/aws-sdk-go-v2/credentials v1.18.24
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2
github.com/coreos/go-oidc/v3 v3.16.0
github.com/expr-lang/expr v1.17.6
github.com/go-co-op/gocron/v2 v2.18.2
github.com/aws/aws-sdk-go-v2 v1.41.1
github.com/aws/aws-sdk-go-v2/config v1.32.6
github.com/aws/aws-sdk-go-v2/credentials v1.19.7
github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0
github.com/coreos/go-oidc/v3 v3.17.0
github.com/expr-lang/expr v1.17.7
github.com/go-co-op/gocron/v2 v2.19.0
github.com/go-ldap/ldap/v3 v3.4.12
github.com/golang-jwt/jwt/v5 v5.3.0
github.com/golang-migrate/migrate/v4 v4.19.1
@@ -31,18 +31,16 @@ require (
github.com/jmoiron/sqlx v1.4.0
github.com/joho/godotenv v1.5.1
github.com/linkedin/goavro/v2 v2.14.1
github.com/mattn/go-sqlite3 v1.14.32
github.com/mattn/go-sqlite3 v1.14.33
github.com/nats-io/nats.go v1.47.0
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/common v0.67.4
github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/stretchr/testify v1.11.1
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.6
github.com/vektah/gqlparser/v2 v2.5.31
golang.org/x/crypto v0.45.0
golang.org/x/oauth2 v0.32.0
golang.org/x/crypto v0.46.0
golang.org/x/oauth2 v0.34.0
golang.org/x/time v0.14.0
)
@@ -50,22 +48,22 @@ require (
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16 // indirect
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect
github.com/aws/smithy-go v1.24.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
@@ -85,28 +83,27 @@ require (
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
github.com/goccy/go-yaml v1.19.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.1 // indirect
github.com/klauspost/compress v1.18.2 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nkeys v0.4.12 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/prometheus/common v0.67.4 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sosodev/duration v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.7 // indirect
@@ -114,13 +111,13 @@ require (
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/mod v0.30.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/sync v0.18.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/tools v0.39.0 // indirect
google.golang.org/protobuf v1.36.10 // indirect
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
golang.org/x/mod v0.31.0 // indirect
golang.org/x/net v0.48.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.39.0 // indirect
golang.org/x/text v0.32.0 // indirect
golang.org/x/tools v0.40.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

go.sum

@@ -1,11 +1,11 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/99designs/gqlgen v0.17.84 h1:iVMdiStgUVx/BFkMb0J5GAXlqfqtQ7bqMCYK6v52kQ0=
github.com/99designs/gqlgen v0.17.84/go.mod h1:qjoUqzTeiejdo+bwUg8unqSpeYG42XrcrQboGIezmFA=
github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH1S0=
github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/ClusterCockpit/cc-lib v1.0.2 h1:ZWn3oZkXgxrr3zSigBdlOOfayZ4Om4xL20DhmritPPg=
github.com/ClusterCockpit/cc-lib v1.0.2/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k=
github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
@@ -14,6 +14,7 @@ github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObk
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI=
@@ -22,52 +23,57 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4=
github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 h1:DHctwEM8P8iTXFxC/QK0MRjwEpWQeM9yzidCRjldUz0=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3/go.mod h1:xdCzcZEtnSTKVDOmUZs4l/j3pSV6rpo1WXl5ugNsL8Y=
github.com/aws/aws-sdk-go-v2/config v1.31.20 h1:/jWF4Wu90EhKCgjTdy1DGxcbcbNrjfBHvksEL79tfQc=
github.com/aws/aws-sdk-go-v2/config v1.31.20/go.mod h1:95Hh1Tc5VYKL9NJ7tAkDcqeKt+MCXQB1hQZaRdJIZE0=
github.com/aws/aws-sdk-go-v2/credentials v1.18.24 h1:iJ2FmPT35EaIB0+kMa6TnQ+PwG5A1prEdAw+PsMzfHg=
github.com/aws/aws-sdk-go-v2/credentials v1.18.24/go.mod h1:U91+DrfjAiXPDEGYhh/x29o4p0qHX5HDqG7y5VViv64=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M=
github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU=
github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4=
github.com/aws/aws-sdk-go-v2/config v1.32.6 h1:hFLBGUKjmLAekvi1evLi5hVvFQtSo3GYwi+Bx4lpJf8=
github.com/aws/aws-sdk-go-v2/config v1.32.6/go.mod h1:lcUL/gcd8WyjCrMnxez5OXkO3/rwcNmvfno62tnXNcI=
github.com/aws/aws-sdk-go-v2/credentials v1.19.7 h1:tHK47VqqtJxOymRrNtUXN5SP/zUTvZKeLx4tH6PGQc8=
github.com/aws/aws-sdk-go-v2/credentials v1.19.7/go.mod h1:qOZk8sPDrxhf+4Wf4oT2urYJrYt3RejHSzgAquYeppw=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 h1:eg/WYAa12vqTphzIdWMzqYRVKKnCboVPRlvaybNCqPA=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13/go.mod h1:/FDdxWhz1486obGrKKC1HONd7krpk38LBt+dutLcN9k=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 h1:NvMjwvv8hpGUILarKw7Z4Q0w1H9anXKsesMxtw++MA4=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4/go.mod h1:455WPHSwaGj2waRSpQp7TsnpOnBfw8iDfPfbwl7KPJE=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 h1:zhBJXdhWIFZ1acfDYIhu4+LCzdUS2Vbcum7D01dXlHQ=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13/go.mod h1:JaaOeCE368qn2Hzi3sEzY6FgAZVCIYcC2nwbro2QCh8=
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 h1:DhdbtDl4FdNlj31+xiRXANxEE+eC7n8JQz+/ilwQ8Uc=
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2/go.mod h1:+wArOOrcHUevqdto9k1tKOF5++YTe9JEcPSc9Tx2ZSw=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 h1:NjShtS1t8r5LUfFVtFeI8xLAHQNTa7UI0VawXlrBMFQ=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 h1:gTsnx0xXNQ6SBbymoDvcoRHL+q4l/dAFsQuKfDWSaGc=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo=
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 h1:HK5ON3KmQV2HcAunnx4sKLB9aPf3gKGwVAf7xnx0QT0=
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 h1:CjMzUs78RDDv4ROu3JnJn/Ig1r6ZD7/T2DXLLRpejic=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16/go.mod h1:uVW4OLBqbJXSHJYA9svT9BluSvvwbzLQ2Crf6UPzR3c=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7 h1:DIBqIrJ7hv+e4CmIk2z3pyKT+3B6qVMgRsawHiR3qso=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7/go.mod h1:vLm00xmBke75UmpNvOcZQ/Q30ZFjbczeLFqGx5urmGo=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 h1:RuNSMoozM8oXlgLG/n6WLaFGoea7/CddrCfIiSA+xdY=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17/go.mod h1:F2xxQ9TZz5gDWsclCtPQscGpP0VUOc8RqgFM3vDENmU=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16 h1:NSbvS17MlI2lurYgXnCOLvCFX38sBW4eiVER7+kkgsU=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16/go.mod h1:SwT8Tmqd4sA6G1qaGdzWCJN99bUmPGHfRwwq3G5Qb+A=
github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0 h1:MIWra+MSq53CFaXXAywB2qg9YvVZifkk6vEGl/1Qor0=
github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0/go.mod h1:79S2BdqCJpScXZA2y+cpZuocWsjGjJINyXnOsf5DTz8=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 h1:VrhDvQib/i0lxvr3zqlUwLwJP4fpmpyD9wYG1vfSu+Y=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5/go.mod h1:k029+U8SY30/3/ras4G/Fnv/b88N4mAfliNn08Dem4M=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 h1:v6EiMvhEYBoHABfbGB4alOYmCIrcgyPPiBE1wZAEbqk=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9/go.mod h1:yifAsgBxgJWn3ggx70A3urX2AN49Y5sJTD1UQFlfqBw=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 h1:gd84Omyu9JLriJVCbGApcLzVR3XtmC4ZDPcAI6Ftvds=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13/go.mod h1:sTGThjphYE4Ohw8vJiRStAcu3rbjtXRsdNB0TvZ5wwo=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/pJ1jOWYlFDJTjRQ=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ=
github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk=
github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coreos/go-oidc/v3 v3.16.0 h1:qRQUCFstKpXwmEjDQTIbyY/5jF00+asXzSkmkoa/mow=
github.com/coreos/go-oidc/v3 v3.16.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc=
github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
@@ -77,8 +83,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec=
github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8=
github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
@@ -89,8 +95,8 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-co-op/gocron/v2 v2.18.2 h1:+5VU41FUXPWSPKLXZQ/77SGzUiPCcakU0v7ENc2H20Q=
github.com/go-co-op/gocron/v2 v2.18.2/go.mod h1:Zii6he+Zfgy5W9B+JKk/KwejFOW0kZTFvHtwIpR4aBI=
github.com/go-co-op/gocron/v2 v2.19.0 h1:OKf2y6LXPs/BgBI2fl8PxUpNAI1DA9Mg+hSeGOS38OU=
github.com/go-co-op/gocron/v2 v2.19.0/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U=
github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs=
github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4=
@@ -140,7 +146,8 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
@@ -190,12 +197,9 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I=
github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -214,27 +218,27 @@ github.com/linkedin/goavro/v2 v2.14.1 h1:/8VjDpd38PRsy02JS0jflAu7JZPfJcGTwqWgMkF
github.com/linkedin/goavro/v2 v2.14.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/mattn/go-sqlite3 v1.14.33 h1:A5blZ5ulQo2AtayQ9/limgHEkFreKj1Dv226a1K73s0=
github.com/mattn/go-sqlite3 v1.14.33/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM=
github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
github.com/nats-io/nkeys v0.4.12 h1:nssm7JKOG9/x4J8II47VWCL1Ds29avyiQDRn0ckMvDc=
github.com/nats-io/nkeys v0.4.12/go.mod h1:MT59A1HYcjIcyQDJStTfaOY6vhy9XTUjOFo+SVsvpBg=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -250,6 +254,7 @@ github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAi
github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
@@ -260,6 +265,9 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
@@ -294,33 +302,33 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY=
golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -328,19 +336,19 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ=
golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ=
golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
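
An editorial note on the cc-lib entries above: the jump from cc-lib v1.0.2 to cc-lib/v2 v2.1.0 is a Go major-version upgrade, and under semantic import versioning the v2 module lives at a new import path. Every consumer must be rewritten accordingly, which is what the remaining hunks in this commit do; the pattern, using the aliases that appear later in the diff (fragment only, not a complete file):

    // Import rewrite required by the cc-lib v2 bump; paths as used in the
    // hunks below. v1 and v2 are distinct modules and must not be mixed.
    import (
        ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" // was .../cc-lib/ccConfig
        cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"  // was .../cc-lib/ccLogger
        "github.com/ClusterCockpit/cc-lib/v2/schema"          // was .../cc-lib/schema
    )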


@@ -52,51 +52,51 @@ models:
- github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32
Job:
model: "github.com/ClusterCockpit/cc-lib/schema.Job"
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job"
fields:
tags:
resolver: true
metaData:
resolver: true
Cluster:
model: "github.com/ClusterCockpit/cc-lib/schema.Cluster"
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster"
fields:
partitions:
resolver: true
# Node:
# model: "github.com/ClusterCockpit/cc-lib/schema.Node"
# model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node"
# fields:
# metaData:
# resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" }
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" }
JobStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" }
GlobalMetricListItem:
{ model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" }
ClusterSupport:
{ model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" }
Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" }
Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" }
SchedulerState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" }
HealthState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" }
MetricStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" }
MetricConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" }
SubClusterConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" }
FilterRanges:
{ model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" }
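
All of the rewritten bindings above point gqlgen at the shared cc-lib/v2 structs instead of letting it generate its own model types. As a hedged illustration of what that buys (the resolver shape, the Repo field, and FindByID below are hypothetical stand-ins, not code from this commit):

    // Illustrative sketch only. With Job bound to schema.Job in gqlgen.yml,
    // generated resolver interfaces consume and return the library type.
    package graph

    import (
        "context"

        "github.com/ClusterCockpit/cc-lib/v2/schema"
    )

    type queryResolver struct {
        Repo interface { // hypothetical repository abstraction
            FindByID(ctx context.Context, id string) (*schema.Job, error)
        }
    }

    // Fields marked "resolver: true" above (tags, metaData) still get their
    // own resolver methods; everything else maps straight onto schema.Job.
    func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
        return r.Repo.FindByID(ctx, id)
    }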


@@ -17,53 +17,44 @@ import (
"strings"
"testing"
"time"
"sync"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux"
_ "github.com/mattn/go-sqlite3"
)
func setup(t *testing.T) *api.RestAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
"jwts": {
"max-age": "2m"
}
}
},
"clusters": [
{
"name": "testcluster",
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
}`
const testclusterJSON = `{
"name": "testcluster",
@@ -155,11 +146,7 @@ func setup(t *testing.T) *api.RestAPI {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -171,13 +158,7 @@ func setup(t *testing.T) *api.RestAPI {
t.Fatal(err)
}
// Initialize memorystore (optional - will return nil if not configured)
// For this test, we don't initialize it to test the nil handling
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
var wg sync.WaitGroup
memorystore.Init(mscfg, &wg)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
@@ -194,30 +175,45 @@ func setup(t *testing.T) *api.RestAPI {
}
func cleanup() {
// Gracefully shutdown archiver with timeout
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
// Shutdown memorystore if it was initialized
memorystore.Shutdown()
}
/*
* This function starts a job, stops it, and tests the REST API.
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
* at least `setup` modifies global state.
* This function starts a job, stops it, and then reads its data from the job-archive.
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
* at least `setup` modifies global state.
*/
func TestRestApi(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
restapi.MountAPIRoutes(r)
var TestJobId int64 = 123
var TestJobID int64 = 123
TestClusterName := "testcluster"
var TestStartTime int64 = 123456789
@@ -265,12 +261,18 @@ func TestRestApi(t *testing.T) {
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
}
// resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
// job.Tags, err = resolver.Job().Tags(ctx, job)
// if err != nil {
// t.Fatal(err)
// }
if job.JobID != 123 ||
job.User != "testuser" ||
job.Project != "testproj" ||
@@ -288,6 +290,10 @@ func TestRestApi(t *testing.T) {
job.StartTime != 123456789 {
t.Fatalf("unexpected job properties: %#v", job)
}
// if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
// t.Fatalf("unexpected tags: %#v", job.Tags)
// }
}); !ok {
return
}
@@ -301,6 +307,7 @@ func TestRestApi(t *testing.T) {
"stopTime": 123457789
}`
var stoppedJob *schema.Job
if ok := t.Run("StopJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
recorder := httptest.NewRecorder()
@@ -314,7 +321,7 @@ func TestRestApi(t *testing.T) {
}
// Archiving happens asynchronously and is completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -336,12 +343,21 @@ func TestRestApi(t *testing.T) {
t.Fatalf("unexpected job.metaData: %#v", job.MetaData)
}
stoppedJob = job
}); !ok {
return
}
// Note: We skip the CheckArchive test because without memorystore initialized,
// archiving will fail gracefully. This test now focuses on the REST API itself.
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(data, testData) {
t.Fatal("unexpected data fetched from archive")
}
})
t.Run("CheckDoubleStart", func(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart!
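
One detail in the test rewrite above deserves a note: instead of booting a memorystore, the test injects fixture data through metricstore.TestLoadDataCallback, and CheckArchive then asserts that metricdispatch.LoadData returns exactly that fixture. A hedged sketch of the mechanics such a hook implies (assumed implementation, not code from this commit):

    // Assumed shape of the test seam: a package-level hook that the metric
    // dispatcher consults before touching a real store.
    package metricstore

    import (
        "context"
        "errors"

        "github.com/ClusterCockpit/cc-lib/v2/schema"
    )

    // TestLoadDataCallback lets tests stub metric loading entirely.
    var TestLoadDataCallback func(job *schema.Job, metrics []string,
        scopes []schema.MetricScope, ctx context.Context, resolution int,
    ) (schema.JobData, error)

    func loadData(job *schema.Job, metrics []string, scopes []schema.MetricScope,
        ctx context.Context, resolution int,
    ) (schema.JobData, error) {
        if TestLoadDataCallback != nil { // test hook takes precedence
            return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
        }
        // the real path would query the in-memory store / job archive here
        return nil, errors.New("metric store not wired in this sketch")
    }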


@@ -13,7 +13,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// GetClustersAPIResponse model
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"

File diff suppressed because it is too large


@@ -22,11 +22,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux"
)
@@ -104,7 +104,7 @@ type JobMetricWithName struct {
// @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
// @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -232,7 +232,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @produce json
// @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -293,7 +293,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
}
if r.URL.Query().Get("all-metrics") == "true" {
data, err = metricdispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -324,8 +324,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @accept json
// @produce json
// @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @param request body api.GetJobAPIRequest true "Array of metric names"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -389,7 +389,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
resolution = max(resolution, mc.Timestep)
}
data, err := metricdispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -478,7 +478,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -542,7 +542,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -606,7 +606,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
// @description Tags will be removed from respective archive files.
// @accept json
// @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -650,7 +650,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param request body schema.Job true "Job to add"
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
// @success 201 {object} api.DefaultAPIResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -728,7 +728,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
// @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to 'Job' scheme.
// @produce json
// @param request body api.StopJobApiRequest true "All fields required"
// @param request body api.StopJobAPIRequest true "All fields required"
// @param request body api.StopJobAPIRequest true "All fields required"
// @success 200 {object} schema.Job "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -754,7 +754,6 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
return
}
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Try cached jobs if not found in main repository
@@ -776,7 +775,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json
// @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -820,8 +819,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// @description Job to delete is specified by request body. All fields are required in this case.
// @accept json
// @produce json
// @param request body api.DeleteJobApiRequest true "All fields required"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @param request body api.DeleteJobAPIRequest true "All fields required"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -873,7 +872,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json
// @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"


@@ -15,8 +15,8 @@ import (
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
@@ -58,7 +58,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
return
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
n := 0
for _, sel := range selectors {
bn, err := ms.Free(sel, to)
@@ -97,9 +97,9 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
return
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw)
return
@@ -129,7 +129,7 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
selector = strings.Split(raw, ":")
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
@@ -162,7 +162,7 @@ func metricsHealth(rw http.ResponseWriter, r *http.Request) {
selector := []string{rawCluster, rawNode}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return


@@ -6,9 +6,9 @@
package api
import (
"bytes"
"database/sql"
"encoding/json"
"strings"
"sync"
"time"
@@ -17,12 +17,48 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/schema"
influx "github.com/influxdata/line-protocol/v2/lineprotocol"
)
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging.
// It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
// job,function=start_job event="{...JSON payload...}" <timestamp>
// job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
// nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct {
JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions
@@ -50,11 +86,7 @@ func (api *NatsAPI) StartSubscriptions() error {
s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobStart, api.handleStartJob); err != nil {
return err
}
if err := client.Subscribe(s.SubjectJobStop, api.handleStopJob); err != nil {
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
return err
}
@@ -67,26 +99,96 @@ func (api *NatsAPI) StartSubscriptions() error {
return nil
}
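
To make the message format documented above concrete, here is a minimal publisher sketch for the job-event subject. The NATS URL and subject string are placeholders (the real subject comes from config.Keys.APISubjects.SubjectJobEvent), and the payload follows the line-protocol shape shown in the package comment:

    // Minimal, self-contained producer sketch using the nats.go client.
    package main

    import (
        "fmt"
        "log"
        "time"

        "github.com/nats-io/nats.go"
    )

    func main() {
        nc, err := nats.Connect(nats.DefaultURL) // placeholder server address
        if err != nil {
            log.Fatal(err)
        }
        defer nc.Close()

        // One job-start event in the documented line-protocol shape:
        //   job,function=start_job event="{...JSON...}" <timestamp>
        line := fmt.Sprintf(`job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\"}" %d`,
            time.Now().UnixNano())

        // "cc.job.event" is a placeholder subject for this sketch.
        if err := nc.Publish("cc.job.event", []byte(line)); err != nil {
            log.Fatal(err)
        }
    }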
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
function, ok := msg.GetTag("function")
if !ok {
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
return
}
switch function {
case "start_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job start event is missing event field with JSON payload")
return
}
api.handleStartJob(v)
case "stop_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job stop event is missing event field with JSON payload")
return
}
api.handleStopJob(v)
default:
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
}
}
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
// - tag "function" with value "start_job" or "stop_job"
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "job" {
api.processJobEvent(m)
} else {
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
}
}
}
// handleStartJob processes job start messages received via NATS.
// Expected JSON payload follows the schema.Job structure.
func (api *NatsAPI) handleStartJob(subject string, data []byte) {
// The payload parameter contains JSON following the schema.Job structure.
// Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
if payload == "" {
cclog.Error("NATS start job: payload is empty")
return
}
req := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
dec := json.NewDecoder(bytes.NewReader(data))
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
cclog.Errorf("NATS start job: parsing request failed: %v", err)
return
}
cclog.Debugf("NATS %s: %s", subject, req.GoString())
cclog.Debugf("NATS start job: %s", req.GoString())
req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil {
cclog.Errorf("NATS %s: sanity check failed: %v", subject, err)
cclog.Errorf("NATS start job: sanity check failed: %v", err)
return
}
@@ -96,14 +198,14 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
cclog.Errorf("NATS %s: checking for duplicate failed: %v", subject, err)
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
return
}
if err == nil {
for _, job := range jobs {
if (req.StartTime - job.StartTime) < secondsPerDay {
cclog.Errorf("NATS %s: job with jobId %d, cluster %s already exists (dbid: %d)",
subject, req.JobID, req.Cluster, job.ID)
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
req.JobID, req.Cluster, job.ID)
return
}
}
@@ -111,14 +213,14 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
id, err := api.JobRepository.Start(&req)
if err != nil {
cclog.Errorf("NATS %s: insert into database failed: %v", subject, err)
cclog.Errorf("NATS start job: insert into database failed: %v", err)
return
}
unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Errorf("NATS %s: adding tag to new job %d failed: %v", subject, id, err)
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
return
}
}
@@ -128,19 +230,24 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
}
// handleStopJob processes job stop messages received via NATS.
// The payload parameter contains JSON following the StopJobAPIRequest structure.
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
if payload == "" {
cclog.Error("NATS stop job: payload is empty")
return
}
var req StopJobAPIRequest
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
return
}
if req.JobID == nil {
cclog.Errorf("NATS %s: the field 'jobId' is required", subject)
cclog.Errorf("NATS job stop: the field 'jobId' is required")
return
}
@@ -148,28 +255,28 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
if err != nil {
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if cachedErr != nil {
cclog.Errorf("NATS %s: finding job failed: %v (cached lookup also failed: %v)",
subject, err, cachedErr)
cclog.Errorf("NATS job stop: finding job failed: %v (cached lookup also failed: %v)",
err, cachedErr)
return
}
job = cachedJob
}
if job.State != schema.JobStateRunning {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
subject, job.JobID, job.ID, job.Cluster, job.State)
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
job.JobID, job.ID, job.Cluster, job.State)
return
}
if job.StartTime > req.StopTime {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
subject, job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
return
}
if req.State != "" && !req.State.Valid() {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: invalid job state: %#v",
subject, job.JobID, job.ID, job.Cluster, req.State)
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
job.JobID, job.ID, job.Cluster, req.State)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
@@ -182,8 +289,8 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
subject, job.JobID, job.ID, job.Cluster, job.State, err)
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
job.JobID, job.ID, job.Cluster, job.State, err)
return
}
}
@@ -198,15 +305,21 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
archiver.TriggerArchiving(job)
}
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Nodestate event is missing event field with JSON payload")
return
}
var req UpdateNodeStatesRequest
dec := json.NewDecoder(strings.NewReader(v))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
return
}
@@ -224,8 +337,44 @@ func (api *NatsAPI) handleNodeState(subject string, data []byte) {
JobsRunning: node.JobsRunning,
}
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "nodestate" {
api.processNodestateEvent(m)
} else {
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
}
}
}
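A sketch of how these two handlers could be attached to a NATS connection, assuming the subjects come from the config.NATSConfig struct shown later in this diff; the actual wiring in cc-backend may differ.

```go
// subscribeAPIEvents is a hypothetical helper that forwards raw NATS
// messages to the line-protocol handlers defined above.
func subscribeAPIEvents(nc *nats.Conn, api *NatsAPI, cfg config.NATSConfig) error {
	if _, err := nc.Subscribe(cfg.SubjectJobEvent, func(m *nats.Msg) {
		api.handleJobEvent(m.Subject, m.Data)
	}); err != nil {
		return err
	}
	_, err := nc.Subscribe(cfg.SubjectNodeState, func(m *nats.Msg) {
		api.handleNodeState(m.Subject, m.Data)
	})
	return err
}
```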

internal/api/nats_test.go Normal file
View File

@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
func setupNatsTest(t *testing.T) *NatsAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc1",
"nodes": "host123,host124,host125",
"processorType": "Intel Core i7-4770",
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
}
}
],
"metricConfig": [
{
"name": "load_one",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
"alert": 0
}
]
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return NewNatsAPI()
}
func cleanupNatsTest() {
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
func TestNatsHandleStartJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
shouldFindJob bool
}{
{
name: "valid job start",
payload: `{
"jobId": 1001,
"user": "testuser1",
"project": "testproj1",
"cluster": "testcluster",
"partition": "main",
"walltime": 7200,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1001 {
t.Errorf("expected JobID 1001, got %d", job.JobID)
}
if job.User != "testuser1" {
t.Errorf("expected user testuser1, got %s", job.User)
}
if job.State != schema.JobStateRunning {
t.Errorf("expected state running, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number",
"user": "testuser2"
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "missing required fields",
payload: `{
"jobId": 1002
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
payload: `{
"jobId": 1003,
"user": "testuser3",
"project": "testproj3",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"unknownField": "should cause error",
"startTime": 1234567900
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with tags",
payload: `{
"jobId": 1004,
"user": "testuser4",
"project": "testproj4",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"tags": [
{
"type": "test",
"name": "testtag",
"scope": "testuser4"
}
],
"startTime": 1234567910
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1004 {
t.Errorf("expected JobID 1004, got %d", job.JobID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleStartJob(tt.payload)
natsAPI.JobRepository.SyncJobs()
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if tt.shouldFindJob {
// Extract jobId from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
startTime := int64(payloadMap["startTime"].(float64))
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
if err != nil {
if !tt.expectError {
t.Fatalf("expected to find job, but got error: %v", err)
}
return
}
if tt.validateJob != nil {
tt.validateJob(t, job)
}
}
})
}
}
func TestNatsHandleStopJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// First, create a running job
startPayload := `{
"jobId": 2001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(startPayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
setupJobFunc func() // Optional: create specific test job
}{
{
name: "valid job stop - completed",
payload: `{
"jobId": 2001,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateCompleted {
t.Errorf("expected state completed, got %s", job.State)
}
expectedDuration := int32(1234571490 - 1234567890)
if job.Duration != expectedDuration {
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
}
},
},
{
name: "valid job stop - failed",
setupJobFunc: func() {
startPayloadFailed := `{
"jobId": 2002,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(startPayloadFailed)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2002,
"cluster": "testcluster",
"startTime": 1234567900,
"jobState": "failed",
"stopTime": 1234569900
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateFailed {
t.Errorf("expected state failed, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number"
}`,
expectError: true,
},
{
name: "missing jobId",
payload: `{
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
{
name: "invalid job state",
setupJobFunc: func() {
startPayloadInvalid := `{
"jobId": 2003,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1]
}
],
"startTime": 1234567910
}`
natsAPI.handleStartJob(startPayloadInvalid)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2003,
"cluster": "testcluster",
"startTime": 1234567910,
"jobState": "invalid_state",
"stopTime": 1234571510
}`,
expectError: true,
},
{
name: "stopTime before startTime",
setupJobFunc: func() {
startPayloadTime := `{
"jobId": 2004,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0]
}
],
"startTime": 1234567920
}`
natsAPI.handleStartJob(startPayloadTime)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2004,
"cluster": "testcluster",
"startTime": 1234567920,
"jobState": "completed",
"stopTime": 1234567900
}`,
expectError: true,
},
{
name: "job not found",
payload: `{
"jobId": 99999,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
}
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.setupJobFunc != nil {
tt.setupJobFunc()
}
natsAPI.handleStopJob(tt.payload)
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if !tt.expectError && tt.validateJob != nil {
// Extract job details from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
var startTime *int64
if st, ok := payloadMap["startTime"]; ok {
ts := int64(st.(float64))
startTime = &ts
}
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
if err != nil {
t.Fatalf("expected to find job, but got error: %v", err)
}
tt.validateJob(t, job)
}
})
}
}
func TestNatsHandleNodeState(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
validateFn func(t *testing.T)
}{
{
name: "valid node state update",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
validateFn: func(t *testing.T) {
// In a full test, we would verify the node state was updated in the database
// For now, just ensure no error occurred
},
},
{
name: "multiple nodes",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid JSON in event field",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
expectError: true,
},
{
name: "empty nodes array",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: false, // Empty array should not cause error
},
{
name: "invalid line protocol format",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Should be handled gracefully with warning
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
// Allow some time for async operations
time.Sleep(50 * time.Millisecond)
if tt.validateFn != nil {
tt.validateFn(t)
}
})
}
}
func TestNatsProcessJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
msgStartJob, err := lp.NewMessage(
"job",
map[string]string{"function": "start_job"},
nil,
map[string]any{
"event": `{
"jobId": 3001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgMissingTag, err := lp.NewMessage(
"job",
map[string]string{},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgUnknownFunc, err := lp.NewMessage(
"job",
map[string]string{"function": "unknown_function"},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
tests := []struct {
name string
message lp.CCMessage
expectError bool
}{
{
name: "start_job function",
message: msgStartJob,
expectError: false,
},
{
name: "missing function tag",
message: msgMissingTag,
expectError: true,
},
{
name: "unknown function",
message: msgUnknownFunc,
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.processJobEvent(tt.message)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
}{
{
name: "valid influx line protocol",
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid influx line protocol",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Decoder should handle empty input gracefully
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// handleJobEvent doesn't return errors; it logs them
// We're just ensuring it doesn't panic
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "non-event message (metric data)",
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
expectError: false,
description: "Should skip non-event messages gracefully",
},
{
name: "wrong measurement name",
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
expectError: false,
description: "Should warn about unexpected measurement but not fail",
},
{
name: "missing event field",
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
expectError: true,
description: "Should error when event field is missing",
},
{
name: "multiple measurements in one message",
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
expectError: false,
description: "Should process multiple lines",
},
{
name: "escaped quotes in JSON payload",
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
expectError: true,
description: "Should handle escaped quotes (though JSON parsing may fail)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "missing cluster field in JSON",
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail when cluster is missing",
},
{
name: "malformed JSON with unescaped quotes",
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail on malformed JSON",
},
{
name: "unicode characters in hostname",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle unicode characters",
},
{
name: "very large node count",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle multiple nodes efficiently",
},
{
name: "timestamp in past",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
expectError: false,
description: "Should accept any valid timestamp",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// Start a job
payload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(payload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Try to start the same job again (within 24 hours)
duplicatePayload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(duplicatePayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Verify only one job exists
jobID := int64(5001)
cluster := "testcluster"
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
if err != nil && err != sql.ErrNoRows {
t.Fatalf("unexpected error: %v", err)
}
if len(jobs) != 1 {
t.Errorf("expected 1 job, got %d", len(jobs))
}
}

View File

@@ -12,7 +12,7 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type UpdateNodeStatesRequest struct {
@@ -47,7 +47,7 @@ func determineState(states []string) schema.SchedulerState {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"

View File

@@ -22,9 +22,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/mux"
)
@@ -48,6 +48,7 @@ import (
const (
noticeFilePath = "./var/notice.txt"
noticeFilePerms = 0o644
maxNoticeLength = 10000 // Maximum allowed notice content length in characters
)
type RestAPI struct {
@@ -61,6 +62,7 @@ type RestAPI struct {
RepositoryMutex sync.Mutex
}
// New creates and initializes a new RestAPI instance with configured dependencies.
func New() *RestAPI {
return &RestAPI{
JobRepository: repository.GetJobRepository(),
@@ -69,6 +71,8 @@ func New() *RestAPI {
}
}
// MountAPIRoutes registers REST API endpoints for job and cluster management.
// These routes use JWT token authentication via the X-Auth-Token header.
func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// REST API Uses TokenAuth
@@ -103,6 +107,8 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
}
}
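As an illustration, a client could call one of these endpoints with the token in that header; host and path are placeholders, and token is a JWT obtained beforehand (for example via the /jwt/ endpoint further down).

```go
req, _ := http.NewRequest(http.MethodGet, "https://cc.example.org/api/jobs/", nil)
req.Header.Set("X-Auth-Token", token) // token: a previously issued JWT
resp, err := http.DefaultClient.Do(req)
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()
```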
// MountUserAPIRoutes registers user-accessible REST API endpoints.
// These are limited endpoints for regular users with JWT token authentication.
func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// REST API Uses TokenAuth
@@ -112,6 +118,8 @@ func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
}
// MountMetricStoreAPIRoutes registers metric storage API endpoints.
// These endpoints handle metric data ingestion and health checks with JWT token authentication.
func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
// REST API Uses TokenAuth
// Note: StrictSlash handles trailing slash variations automatically
@@ -126,6 +134,8 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
}
// MountConfigAPIRoutes registers configuration and user management endpoints.
// These routes use session-based authentication and require admin privileges.
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// Settings Frontend Uses SessionAuth
@@ -139,6 +149,8 @@ func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
}
}
// MountFrontendAPIRoutes registers frontend-specific API endpoints.
// These routes support JWT generation and user configuration updates with session authentication.
func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// Settings Frontend Uses SessionAuth
@@ -160,6 +172,8 @@ type DefaultAPIResponse struct {
Message string `json:"msg"`
}
// handleError writes a standardized JSON error response with the given status code.
// It logs the error at WARN level and ensures proper Content-Type headers are set.
func handleError(err error, statusCode int, rw http.ResponseWriter) {
cclog.Warnf("REST ERROR : %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
@@ -172,15 +186,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
}
}
// decode reads JSON from r into val with strict validation that rejects unknown fields.
func decode(r io.Reader, val any) error {
dec := json.NewDecoder(r)
dec.DisallowUnknownFields()
return dec.Decode(val)
}
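A hypothetical handler sketch showing how decode and handleError combine: unknown fields in the body are rejected up front, mirroring the NATS handlers; the request type here is invented for illustration.

```go
func (api *RestAPI) exampleHandler(rw http.ResponseWriter, r *http.Request) {
	var req struct {
		Name string `json:"name"`
	}
	// Strict decoding: a body with unexpected keys fails early.
	if err := decode(r.Body, &req); err != nil {
		handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
		return
	}
	// ... act on req and write a response ...
}
```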
// validatePathComponent checks if a path component contains potentially malicious patterns
// that could be used for path traversal attacks. Returns an error if validation fails.
func validatePathComponent(component, componentName string) error {
if strings.Contains(component, "..") ||
strings.Contains(component, "/") ||
strings.Contains(component, "\\") {
return fmt.Errorf("invalid %s", componentName)
}
return nil
}
// editNotice godoc
// @summary Update system notice
// @tags Config
// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
// @accept mpfd
// @produce plain
// @param new-content formData string true "New notice content (max 10000 characters)"
// @success 200 {string} string "Update Notice Content Success"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /notice/ [post]
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
return
@@ -189,9 +226,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// Get Value
newContent := r.FormValue("new-content")
// Validate content length to prevent DoS
if len(newContent) > maxNoticeLength {
handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
return
}
@@ -203,7 +239,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
return
}
if err := ntxt.Close(); err != nil {
cclog.Warnf("Failed to close notice file: %v", err)
}
}
if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
@@ -213,13 +251,30 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
var msg []byte
if newContent != "" {
rw.Write([]byte("Update Notice Content Success"))
msg = []byte("Update Notice Content Success")
} else {
rw.Write([]byte("Empty Notice Content Success"))
msg = []byte("Empty Notice Content Success")
}
if _, err := rw.Write(msg); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// getJWT godoc
// @summary Generate JWT token
// @tags Frontend
// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
// @accept mpfd
// @produce plain
// @param username formData string true "Username to generate JWT for"
// @success 200 {string} string "JWT token"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 404 {object} ErrorResponse "User Not Found"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /jwt/ [get]
func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
username := r.FormValue("username")
@@ -244,12 +299,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
}
rw.WriteHeader(http.StatusOK)
if _, err := rw.Write([]byte(jwt)); err != nil {
cclog.Errorf("Failed to write JWT response: %v", err)
}
}
// getRoles godoc
// @summary Get available roles
// @tags Config
// @description Returns a list of valid user roles. Only admins are allowed.
// @produce json
// @success 200 {array} string "List of role names"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /roles/ [get]
func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
@@ -268,6 +333,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
}
}
// updateConfiguration godoc
// @summary Update user configuration
// @tags Frontend
// @description Updates a user's configuration key-value pair.
// @accept mpfd
// @produce plain
// @param key formData string true "Configuration key"
// @param value formData string true "Configuration value"
// @success 200 {string} string "success"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /configuration/ [post]
func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
key, value := r.FormValue("key"), r.FormValue("value")
@@ -278,9 +355,25 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte("success"))
if _, err := rw.Write([]byte("success")); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// putMachineState godoc
// @summary Store machine state
// @tags Machine State
// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @accept json
// @produce plain
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 201 "Created"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [put]
func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
@@ -291,13 +384,12 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
cluster := vars["cluster"]
host := vars["host"]
// Validate cluster and host to prevent path traversal attacks
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
@@ -323,6 +415,18 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusCreated)
}
// getMachineState godoc
// @summary Retrieve machine state
// @tags Machine State
// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @produce json
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 200 {object} object "Machine state JSON data"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [get]
func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
@@ -333,13 +437,12 @@ func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
cluster := vars["cluster"]
host := vars["host"]
// Validate cluster and host to prevent path traversal attacks
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}

View File

@@ -11,8 +11,8 @@ import (
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux"
)
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"

View File

@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).
```go
// In archiver.go ArchiveJob() function
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
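To make the resolution parameter concrete, a hedged sketch of the two typical call sites, reusing the names from the snippet above:

```go
// Archiving wants full fidelity: 0 selects the highest available resolution.
archData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0)

// A dashboard query can trade fidelity for volume with 5-minute samples.
uiData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
```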
@@ -185,6 +185,6 @@ Internal state is protected by:
## Dependencies
- `internal/repository`: Database operations for job metadata
- `internal/metricdispatch`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures

View File

@@ -54,8 +54,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)

View File

@@ -10,10 +10,10 @@ import (
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// ArchiveJob archives a completed job's metric data to the configured archive backend.
@@ -60,7 +60,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 resolution value retrieves highest res (60s)
if err != nil {
cclog.Error("Error while loading job data for archiving")
return nil, err

View File

@@ -25,9 +25,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/sessions"
)
@@ -616,9 +616,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
}
// If SplitHostPort fails, IPAddress is already just a host (no port)
// If nothing declared in config: Continue
if len(config.Keys.APIAllowedIPs) == 0 {
return nil
}
// If wildcard declared in config: Continue
if config.Keys.APIAllowedIPs[0] == "*" {

View File

@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)

View File

@@ -12,8 +12,8 @@ import (
"net/http"
"os"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)

View File

@@ -11,8 +11,8 @@ import (
"fmt"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)

View File

@@ -8,7 +8,7 @@ package auth
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -237,7 +237,6 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

View File

@@ -13,8 +13,8 @@ import (
"os"
"strings"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)

View File

@@ -13,8 +13,8 @@ import (
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-ldap/ldap/v3"
)

View File

@@ -9,8 +9,8 @@ import (
"fmt"
"net/http"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"golang.org/x/crypto/bcrypt"
)

View File

@@ -15,8 +15,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"golang.org/x/oauth2"

View File

@@ -11,8 +11,8 @@ import (
"encoding/json"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
)
type ProgramConfig struct {
@@ -78,9 +78,6 @@ type ProgramConfig struct {
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
}
type ResampleConfig struct {
@@ -93,8 +90,7 @@ type ResampleConfig struct {
}
type NATSConfig struct {
SubjectJobEvent string `json:"subjectJobEvent"`
SubjectNodeState string `json:"subjectNodeState"`
}
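A minimal sketch of decoding this struct from a JSON fragment; the subject names are placeholders, not project defaults:

```go
raw := json.RawMessage(`{
	"subjectJobEvent": "cc.job.events",
	"subjectNodeState": "cc.node.state"
}`)
var natsCfg NATSConfig
if err := json.Unmarshal(raw, &natsCfg); err != nil {
	cclog.Abortf("Config Init: invalid NATS config: %s", err.Error())
}
```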
@@ -115,13 +111,6 @@ type FilterRanges struct {
StartTime *TimeRange `json:"startTime"`
}
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
@@ -135,7 +124,7 @@ var Keys ProgramConfig = ProgramConfig{
ShortRunningJobsDuration: 5 * 60,
}
func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
@@ -143,17 +132,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}

View File

@@ -8,19 +8,15 @@ package config
import (
"testing"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}

View File

@@ -6,7 +6,7 @@
package config
var configSchema = `
{
"type": "object",
"properties": {
"addr": {
@@ -120,103 +120,20 @@ var configSchema = `
},
"required": ["trigger", "resolutions"]
},
"upstreamMetricRepository": {
"description": "Global upstream metric repository configuration for metric pull workers",
"apiSubjects": {
"description": "NATS subjects configuration for subscribing to job and node events.",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"subjectJobEvent": {
"description": "NATS subject for job events (start_job, stop_job)",
"type": "string"
},
"token": {
"subjectNodeState": {
"description": "NATS subject for node state updates",
"type": "string"
}
},
"required": ["kind"]
"required": ["subjectJobEvent", "subjectNodeState"]
}
}
}`

View File

@@ -8,7 +8,7 @@ package config
import (
"encoding/json"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)

File diff suppressed because it is too large

View File

@@ -10,7 +10,7 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type ClusterMetricWithName struct {
@@ -82,6 +82,7 @@ type JobFilter struct {
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}

View File

@@ -4,7 +4,7 @@ import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)

View File

@@ -3,7 +3,7 @@ package graph
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.85
import (
"context"
@@ -19,11 +19,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Partitions is the resolver for the partitions field.
@@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
@@ -484,7 +484,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return nil, err
}
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
cclog.Warn("Error while loading job data")
return nil, err
@@ -512,7 +512,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
return nil, err
}
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
@@ -537,7 +537,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
return nil, err
}
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
@@ -590,21 +590,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
hasNextPage := false
if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage = len(nextJobs) == 1
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage := len(nextJobs) == 1
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
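The probe above generalizes: to learn whether page P+1 exists at N items per page, request exactly one item just past the end of page P. A minimal sketch, assuming a hypothetical queryJobs(page, itemsPerPage) helper rather than the repository API:

func hasNext(queryJobs func(page, itemsPerPage int) []int, page, itemsPerPage int) bool {
	if itemsPerPage == -1 {
		return false // "Load All": no next page required
	}
	// Page 4 @ 10 IpP: probe page 41 @ 1 IpP, i.e. item 41.
	probe := queryJobs(page*itemsPerPage+1, 1)
	return len(probe) == 1
}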
@@ -702,7 +705,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
@@ -753,13 +756,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}
defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
defaultMetrics = append(defaultMetrics, mc.Name)
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
metrics = defaultMetrics
} else {
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
})
}
data, err := metricdispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -825,7 +834,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
}
}
data, err := metricdispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
return nil, err
@@ -880,7 +889,7 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
data, err := metricdispatcher.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -972,12 +981,10 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// SubCluster returns generated.SubClusterResolver implementation.
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
type (
clusterResolver struct{ *Resolver }
jobResolver struct{ *Resolver }
metricValueResolver struct{ *Resolver }
mutationResolver struct{ *Resolver }
nodeResolver struct{ *Resolver }
queryResolver struct{ *Resolver }
subClusterResolver struct{ *Resolver }
)
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type metricValueResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type nodeResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }

View File

@@ -13,9 +13,9 @@ import (
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -55,7 +55,7 @@ func (r *queryResolver) rooflineHeatmap(
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricdispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
return nil, err
@@ -128,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricdispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}

View File

@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
@@ -14,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.

View File

@@ -16,8 +16,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// copyFile copies a file from source path to destination path.
@@ -56,33 +56,8 @@ func setup(t *testing.T) *repository.JobRepository {
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "testcluster",
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "fritz",
"filterRanges": {
"numNodes": { "from": 1, "to": 944 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "taurus",
"filterRanges": {
"numNodes": { "from": 1, "to": 4000 },
"duration": { "from": 0, "to": 604800 },
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
}
}
]}`
}
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
@@ -118,11 +93,7 @@ func setup(t *testing.T) *repository.JobRepository {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
t.Fatal("Cluster configuration must be present")
}
config.Init(cfg)
} else {
t.Fatal("Main configuration must be present")
}

View File

@@ -22,8 +22,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const (

View File

@@ -2,12 +2,13 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"math"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value

View File

@@ -8,7 +8,7 @@ import (
"fmt"
"testing"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.

View File

@@ -1,190 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
// BufferCap is the default buffer capacity.
// buffer.data will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const BufferCap int = DefaultBufferCapacity
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyway, it is allowed.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
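Tracing the index arithmetic above with illustrative numbers: a buffer created by newBuffer(1000, 10) gets start = 1000 - 10/2 = 995; a later write(1037, v) computes idx = (1037 - 995) / 10 = 4, so if only slot 0 was written so far, slots 1 through 3 are padded with NaN before v is appended. Gaps therefore stay visible to readers instead of silently shifting values.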
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function walks back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -1,115 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"fmt"
"time"
)
const (
DefaultMaxWorkers = 10
DefaultBufferCapacity = 512
DefaultGCTriggerInterval = 100
DefaultAvroWorkers = 4
DefaultCheckpointBufferMin = 3
DefaultAvroCheckpointInterval = time.Minute
)
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
RetentionInMemory string `json:"retention-in-memory"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
var Keys MetricStoreConfig
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
var Metrics map[string]MetricConfig
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric registers a metric configuration under the given name.
// If a metric with that name already exists, the larger of the two
// frequencies is kept, so redundant definitions converge on the maximum.
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig, 0)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
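A minimal usage sketch of the merge rule above; metric name and frequencies are illustrative:

AddMetric("flops_any", MetricConfig{Frequency: 30, Aggregation: AvgAggregation})
AddMetric("flops_any", MetricConfig{Frequency: 60, Aggregation: AvgAggregation})
freq, _ := GetMetricFrequency("flops_any")
fmt.Println(freq) // 60: the larger value was kept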

View File

@@ -1,95 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
},
"restore": {
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
"type": "string"
}
}
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the archived files should be placed.",
"type": "string"
}
}
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"nats": {
"description": "Configuration for accepting published data through NATS.",
"type": "array",
"items": {
"type": "object",
"properties": {
"address": {
"description": "Address of the NATS server.",
"type": "string"
},
"username": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"password": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"creds-file-path": {
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
"type": "string"
},
"subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
}
}
}
}
}`
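A minimal configuration instance that should validate against this schema; directories and durations are placeholders:

{
  "checkpoints": {
    "file-format": "avro",
    "interval": "1h",
    "directory": "./var/checkpoints",
    "restore": "48h"
  },
  "archive": {
    "interval": "24h",
    "directory": "./var/archive"
  },
  "retention-in-memory": "48h"
}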

View File

@@ -1,192 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/util"
)
// Could also be called "node" as this forms a node in a tree structure.
// Called Level because "node" might be confusing here.
// Can be either a leaf or an inner node. In this tree structure, inner nodes can
// also hold data (in `metrics`).
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`?
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
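The locking discipline above, reduced to its core: an optimistic read lock on the fast path, then a write lock with a mandatory re-check, because another goroutine may have inserted the child between RUnlock and Lock. A generic sketch (assumes the usual sync import), not code from this package:

func getOrCreate(mu *sync.RWMutex, m map[string]*Level, key string) *Level {
	mu.RLock()
	child, ok := m[key]
	mu.RUnlock()
	if ok {
		return child
	}
	mu.Lock()
	defer mu.Unlock()
	if child, ok := m[key]; ok {
		return child // another goroutine won the race
	}
	child = &Level{}
	m[key] = child
	return child
}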
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}
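The function distinguishes three element kinds (a fixed name, a group of names, and a wildcard) that compose into a path. Field names follow the code above; the exact literal syntax of util.Selector is an assumption:

// Cluster "emmy", hosts "host1" or "host2", any component below them:
sel := util.Selector{
	{String: "emmy"},
	{Group: []string{"host1", "host2"}},
	{Any: true},
}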

View File

@@ -1,429 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package memorystore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package memorystore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
)
type Metric struct {
Name string
Value schema.Float
MetricConfig MetricConfig
}
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
}
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
// dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(*mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Pass the config.MetricStoreKeys
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that only once the ratio of new allocations exceeds the
// previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
wg.Add(4)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// the shutdown() function when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFunc = shutdown
err = ReceiveNats(ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
}
// InitMetrics creates a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
return nil
}
return msInstance
}
func Shutdown() {
// Check if memorystore was initialized
if msInstance == nil {
cclog.Debug("[METRICSTORE]> MemoryStore not initialized, skipping shutdown")
return
}
// Cancel the context to signal all background goroutines to stop
if shutdownFunc != nil {
shutdownFunc()
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
close(LineProtocolMessages)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
func getName(m *MemoryStore, i int) string {
for key, val := range m.Metrics {
if val.offset == i {
return key
}
}
return ""
}
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
tickInterval := d / 2
if tickInterval <= 0 {
return
}
ticker := time.NewTicker(tickInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := ms.Free(nil, t.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig in `metrics` is filled in
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
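A sketch of the write path from a caller's perspective; selector, timestamp, and value are illustrative. Write fills in the per-metric config from m.Metrics, so callers only need name and value:

ms := GetMemoryStore()
if err := ms.Write([]string{"emmy", "host123", "cpu0"}, time.Now().Unix(),
	[]Metric{{Name: "flops_any", Value: schema.Float(42)}}); err != nil {
	cclog.Errorf("write failed: %s", err.Error())
}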
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// ListChildren returns, for the given selector, a list of all children of
// the selected level.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}

File diff suppressed because it is too large

View File

@@ -1,127 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
}
var upstreamMetricDataRepo MetricDataRepository
// func Init() error {
// for _, cluster := range config.Clusters {
// if cluster.MetricDataRepository != nil {
// var kind struct {
// Kind string `json:"kind"`
// }
// if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
// cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
// return err
// }
//
// var mdr MetricDataRepository
// switch kind.Kind {
// case "cc-metric-store":
// mdr = &CCMetricStore{}
// case "prometheus":
// mdr = &PrometheusDataRepository{}
// case "test":
// mdr = &TestMetricDataRepository{}
// default:
// return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
// }
//
// if err := mdr.Init(cluster.MetricDataRepository); err != nil {
// cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
// return err
// }
// metricDataRepos[cluster.Name] = mdr
// }
// }
// return nil
// }
// func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
// var err error
// repo, ok := metricDataRepos[cluster]
//
// if !ok {
// err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
// }
//
// return repo, err
// }
// InitUpstreamRepos initializes global upstream metric data repository for the pull worker
func InitUpstreamRepos() error {
if config.Keys.UpstreamMetricRepository == nil {
return nil
}
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(*config.Keys.UpstreamMetricRepository, &kind); err != nil {
cclog.Warn("Error while unmarshaling raw json UpstreamMetricRepository")
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "prometheus":
mdr = &PrometheusDataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("METRICDATA/METRICDATA > Unknown UpstreamMetricRepository %v", kind.Kind)
}
if err := mdr.Init(*config.Keys.UpstreamMetricRepository); err != nil {
cclog.Errorf("Error initializing UpstreamMetricRepository %v", kind.Kind)
return err
}
upstreamMetricDataRepo = mdr
cclog.Infof("Initialized global upstream metric repository '%s'", kind.Kind)
return nil
}
// GetUpstreamMetricDataRepo returns the global upstream metric data repository
func GetUpstreamMetricDataRepo() (MetricDataRepository, error) {
if upstreamMetricDataRepo == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no upstream metric data repository configured")
}
return upstreamMetricDataRepo, nil
}

View File

@@ -1,588 +0,0 @@
// Copyright (C) 2022 DKRZ
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
promcfg "github.com/prometheus/common/config"
promm "github.com/prometheus/common/model"
)
type PrometheusDataRepositoryConfig struct {
Url string `json:"url"`
Username string `json:"username,omitempty"`
Suffix string `json:"suffix,omitempty"`
Templates map[string]string `json:"query-templates"`
}
type PrometheusDataRepository struct {
client promapi.Client
queryClient promv1.API
suffix string
templates map[string]*template.Template
}
type PromQLArgs struct {
Nodes string
}
type Trie map[rune]Trie
var logOnce sync.Once
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
if len(data) == 0 {
return 0.0, 0.0, 0.0
}
min := math.MaxFloat64
max := -math.MaxFloat64
var sum float64
var n float64
for _, val := range data {
if val.IsNaN() {
continue
}
sum += float64(val)
n += 1
if float64(val) > max {
max = float64(val)
}
if float64(val) < min {
min = float64(val)
}
}
if n == 0 {
	return 0.0, 0.0, 0.0 // all values were NaN
}
return min, max, sum / n
}
// Rewritten from
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
func nodeRegex(nodes []string) string {
root := Trie{}
// add runes of each compute node to trie
for _, node := range nodes {
_trie := root
for _, c := range node {
if _, ok := _trie[c]; !ok {
_trie[c] = Trie{}
}
_trie = _trie[c]
}
_trie['*'] = Trie{}
}
// recursively build regex from rune trie
var trieRegex func(trie Trie, reset bool) string
trieRegex = func(trie Trie, reset bool) string {
if reset {
trie = root
}
if len(trie) == 0 {
return ""
}
if len(trie) == 1 {
for key, _trie := range trie {
if key == '*' {
return ""
}
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
}
} else {
sequences := []string{}
for key, _trie := range trie {
if key != '*' {
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
}
}
sort.Slice(sequences, func(i, j int) bool {
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
})
var result string
// single edge from this tree node
if len(sequences) == 1 {
result = sequences[0]
if len(result) > 1 {
result = "(?:" + result + ")"
}
// multiple edges, each length 1
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
// char or numeric range
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
// char or numeric set
} else {
result = "[" + s + "]"
}
// multiple edges of different lengths
} else {
result = "(?:" + strings.Join(sequences, "|") + ")"
}
if _, ok := trie['*']; ok {
result += "?"
}
return result
}
return ""
}
return trieRegex(root, true)
}
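Tracing the algorithm by hand for two hosts (result derived from the branches above, not from a running instance): "node1" and "node2" share the prefix "node", and the trailing digits are consecutive, so the numeric-range branch applies:

nodeRegex([]string{"node1", "node2"}) // "node[1-2]"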
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var config PrometheusDataRepositoryConfig
// parse config
if err := json.Unmarshal(rawConfig, &config); err != nil {
cclog.Warn("Error while unmarshaling raw json config")
return err
}
// support basic authentication
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
client, err := promapi.NewClient(promapi.Config{
Address: config.Url,
RoundTripper: rt,
})
if err != nil {
cclog.Error("Error while initializing new prometheus client")
return err
}
// init query client
pdb.client = client
pdb.queryClient = promv1.NewAPI(pdb.client)
// site config
pdb.suffix = config.Suffix
// init query templates
pdb.templates = make(map[string]*template.Template)
for metric, templ := range config.Templates {
pdb.templates[metric], err = template.New(metric).Parse(templ)
if err == nil {
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
} else {
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
}
}
return nil
}
// TODO: respect scope argument
func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
} else {
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
}
buf := &bytes.Buffer{}
if templ, ok := pdb.templates[metric]; ok {
err := templ.Execute(buf, args)
if err != nil {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
} else {
query := buf.String()
cclog.Debugf("PromQL: %s", query)
return query, nil
}
} else {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
}
}
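An illustrative repository configuration for this backend; the URL and the PromQL body are placeholders, only url, query-templates, and the {{.Nodes}} parameter are prescribed by the types above:

{
  "kind": "prometheus",
  "url": "http://localhost:9090",
  "query-templates": {
    "mem_used": "memory_used_bytes{instance=~\"{{.Nodes}}\"}"
  }
}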
// Convert PromAPI row to CC schema.Series
func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
for _, v := range row.Values {
idx := (v.Timestamp.Unix() - ts) / step
if idx < 0 || idx >= int64(len(values)) {
	continue // ignore samples outside the requested window
}
values[idx] = schema.Float(v.Value)
}
min, max, mean := MinMaxMean(values)
// output struct
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
},
}
}
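Tracing the slot mapping above with illustrative numbers: with from.Unix() = 1000 and step = 60, a sample stamped 1120 lands at idx = (1120 - 1000) / 60 = 2; slots that receive no sample keep the NaN they were initialized with.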
func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
jobData := make(schema.JobData)
// parse job specs
nodes := make([]string, len(job.Resources))
for i, resource := range job.Resources {
nodes[i] = resource.Hostname
}
from := time.Unix(job.StartTime, 0)
to := time.Unix(job.StartTime+int64(job.Duration), 0)
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all job nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
// init data structures
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
jobMetric.Series = append(jobMetric.Series,
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
sort.Slice(jobMetric.Series, func(i, j int) bool {
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
})
}
}
return jobData, nil
}
// TODO change implementation to precomputed/cached stats
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for stats")
return nil, err
}
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = series.Statistics
}
}
return stats, nil
}
func (pdb *PrometheusDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
// query db for each metric
// TODO: the scopes argument seems to always be empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[hostname] = hostdata
}
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},
)
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
// Implemented by NHR@FAU; Used in Job-View StatsTable
func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Implemented by NHR@FAU; Used in NodeList-View
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for NodeList
// Fetch Data, based on pdb.LoadNodeData()
t0 := time.Now()
// Map of hosts of jobData
data := make(map[string]schema.JobData)
// query db for each metric
// TODO: the scopes argument seems to always be empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(schema.JobData)
data[hostname] = hostdata
}
metricdata, ok := hostdata[metric]
if !ok {
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
data[hostname][metric] = metricdata
}
// output per host, metric and scope
scopeData, ok := metricdata[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}
data[hostname][metric][scope] = scopeData
}
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
return data, nil
}

View File

@@ -1,74 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
// TestMetricDataRepository is only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
panic("TODO")
}
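A sketch of how a test might wire canned data through this mock; the callback body is an assumption for illustration:

metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string,
	scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
	return schema.JobData{}, nil // return fixture data here
}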

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricdispatcher provides a unified interface for loading and caching job metric data.
// Package metricdispatch provides a unified interface for loading and caching job metric data.
//
// This package serves as a central dispatcher that routes metric data requests to the appropriate
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
@@ -29,13 +29,13 @@
//
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
//
// jobData, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, resolution)
// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// // Handle error
// }
//
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
package metricdispatcher
package metricdispatch
import (
"context"
@@ -44,12 +44,12 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
@@ -109,7 +109,7 @@ func LoadData(job *schema.Job,
}
}
jd, err = memorystore.LoadData(job, metrics, scopes, ctx, resolution)
jd, err = metricstore.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
@@ -238,7 +238,7 @@ func LoadAverages(
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
stats, err := memorystore.LoadStats(job, metrics, ctx)
stats, err := metricstore.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -275,7 +275,7 @@ func LoadScopedJobStats(
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
scopedStats, err := memorystore.LoadScopedStats(job, metrics, scopes, ctx)
scopedStats, err := metricstore.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -299,7 +299,7 @@ func LoadJobStats(
data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := memorystore.LoadStats(job, metrics, ctx)
stats, err := metricstore.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -348,7 +348,7 @@ func LoadNodeData(
}
}
data, err := memorystore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricstore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
@@ -385,7 +385,7 @@ func LoadNodeListData(
}
}
data, err := memorystore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
data, err := metricstore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",

View File

@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatcher
package metricdispatch
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestDeepCopy(t *testing.T) {

View File

@@ -3,22 +3,32 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
// This file contains the API types and data fetching logic for querying metric data
// from the in-memory metric store. It provides structures for building complex queries
// with support for aggregation, scaling, padding, and statistics computation.
package metricstore
import (
"errors"
"fmt"
"math"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
var (
// ErrInvalidTimeRange is returned when a query has 'from' > 'to'
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty")
// ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified
ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty")
)
// APIMetricData represents the response data for a single metric query.
//
// It contains both the time-series data points and computed statistics (avg, min, max).
// If an error occurred during data retrieval, the Error field will be set and other
// fields may be incomplete.
type APIMetricData struct {
Error *string `json:"error,omitempty"`
Data schema.FloatArray `json:"data,omitempty"`
@@ -30,6 +40,13 @@ type APIMetricData struct {
Max schema.Float `json:"max"`
}
// APIQueryRequest represents a batch query request for metric data.
//
// It supports two modes of operation:
// 1. Explicit queries via the Queries field
// 2. Automatic query generation via ForAllNodes (queries all specified metrics for all nodes in the cluster)
//
// The request can be customized with flags to include/exclude statistics, raw data, and padding.
type APIQueryRequest struct {
Cluster string `json:"cluster"`
Queries []APIQuery `json:"queries"`
@@ -41,11 +58,25 @@ type APIQueryRequest struct {
WithPadding bool `json:"with-padding"`
}
// APIQueryResponse represents the response to an APIQueryRequest.
//
// Results is a 2D array where each outer element corresponds to a query,
// and each inner element corresponds to a selector within that query
// (e.g., multiple CPUs or cores).
type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"`
Results [][]APIMetricData `json:"results"`
}
// APIQuery represents a single metric query with optional hierarchical selectors.
//
// The hierarchical selection works as follows:
// - Hostname: The node to query
// - Type + TypeIds: First level of hierarchy (e.g., "cpu" + ["0", "1", "2"])
// - SubType + SubTypeIds: Second level of hierarchy (e.g., "core" + ["0", "1"])
//
// If Aggregate is true, data from multiple type/subtype IDs will be aggregated according
// to the metric's aggregation strategy. Otherwise, separate results are returned for each combination.
type APIQuery struct {
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
@@ -58,6 +89,11 @@ type APIQuery struct {
Aggregate bool `json:"aggreg"`
}
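// A sketch of a batch request querying one metric on two cpus of a single
// node, aggregated per the metric's strategy. Values are illustrative and
// the field names follow the documentation above (the struct is abridged
// in this diff):
cpu := "cpu"
req := APIQueryRequest{
	Cluster:   "emmy",
	From:      1700000000,
	To:        1700003600,
	WithStats: true,
	Queries: []APIQuery{{
		Metric:    "flops_any",
		Hostname:  "node001",
		Type:      &cpu,
		TypeIds:   []string{"0", "1"},
		Aggregate: true,
	}},
}
resp, err := FetchData(req) // FetchData is defined below
_, _ = resp, err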
// AddStats computes and populates the Avg, Min, and Max fields from the Data array.
//
// NaN values in the data are ignored during computation. If all values are NaN,
// the statistics fields will be set to NaN.
//
// TODO: Optimize this, just like the stats endpoint!
func (data *APIMetricData) AddStats() {
n := 0
@@ -83,6 +119,10 @@ func (data *APIMetricData) AddStats() {
}
}
// ScaleBy multiplies all data points and statistics by the given factor.
//
// This is commonly used for unit conversion (e.g., bytes to gigabytes).
// Scaling by 0 or 1 is a no-op for performance reasons.
func (data *APIMetricData) ScaleBy(f schema.Float) {
if f == 0 || f == 1 {
return
@@ -96,6 +136,17 @@ func (data *APIMetricData) ScaleBy(f schema.Float) {
}
}
// PadDataWithNull pads the beginning of the data array with NaN values if needed.
//
// This ensures that the data aligns with the requested 'from' timestamp, even if
// the metric store doesn't have data for the earliest time points. This is useful
// for maintaining consistent array indexing across multiple queries.
//
// Parameters:
// - ms: MemoryStore instance to lookup metric configuration
// - from: The requested start timestamp
// - to: The requested end timestamp (unused but kept for API consistency)
// - metric: The metric name to lookup frequency information
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
minfo, ok := ms.Metrics[metric]
if !ok {
@@ -115,6 +166,31 @@ func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metr
}
}
// FetchData executes a batch metric query request and returns the results.
//
// This is the primary API for retrieving metric data from the memory store. It supports:
// - Individual queries via req.Queries
// - Batch queries for all nodes via req.ForAllNodes
// - Hierarchical selector construction (cluster → host → type → subtype)
// - Optional statistics computation (avg, min, max)
// - Optional data scaling
// - Optional data padding with NaN values
//
// The function constructs selectors based on the query parameters and calls MemoryStore.Read()
// for each selector. If a query specifies Aggregate=false with multiple type/subtype IDs,
// separate results are returned for each combination.
//
// Parameters:
// - req: The query request containing queries, time range, and options
//
// Returns:
// - APIQueryResponse containing results for each query, or error if validation fails
//
// Errors:
// - ErrInvalidTimeRange if req.From > req.To
// - ErrEmptyCluster if req.ForAllNodes is used without specifying a cluster
// - Error if MemoryStore is not initialized
// - Individual query errors are stored in APIMetricData.Error field
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
if req.From > req.To {
return nil, ErrInvalidTimeRange
@@ -126,10 +202,9 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
req.WithData = true
ms := GetMemoryStore()
if ms == nil {
return nil, fmt.Errorf("memorystore not initialized")
return nil, fmt.Errorf("[METRICSTORE]> memorystore not initialized")
}
response := APIQueryResponse{
Results: make([][]APIMetricData, 0, len(req.Queries)),
}
@@ -196,8 +271,6 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
}
}
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
var err error
res := make([]APIMetricData, 0, len(sels))
for _, sel := range sels {

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"archive/zip"
@@ -18,15 +18,39 @@ import (
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// Archiving starts a background worker that either archives old checkpoints or deletes them, depending on whether an archive is configured.
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
if Keys.Archive != nil {
// Run as Archiver
runWorker(wg, ctx,
Keys.Archive.ArchiveInterval,
"archiving",
Keys.Archive.RootDir,
Keys.Archive.DeleteInstead,
)
} else {
// Run as Deleter
runWorker(wg, ctx,
Keys.RetentionInMemory,
"deleting",
"",
true,
)
}
}
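// A minimal sketch of starting the worker (configuration illustrative):
// with Keys.Archive == nil it runs in "deleting" mode, removing checkpoints
// older than Keys.RetentionInMemory. Archiving returns immediately; the
// loop runs in a goroutine that calls wg.Done() when the context ends.
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
wg.Add(1)
Archiving(&wg, ctx)
// ... on shutdown:
cancel()
wg.Wait()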
// runWorker implements the shared loop for both modes: it parses the given interval, then periodically archives or deletes checkpoints older than that interval.
func runWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, archiveDir string, delete bool) {
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.Archive.Interval)
d, err := time.ParseDuration(interval)
if err != nil {
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
cclog.Fatalf("[METRICSTORE]> error parsing %s interval duration: %v\n", mode, err)
}
if d <= 0 {
return
@@ -41,14 +65,18 @@ func Archiving(wg *sync.WaitGroup, ctx context.Context) {
return
case <-ticker.C:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
cclog.Infof("[METRICSTORE]> start %s checkpoints (older than %s)...", mode, t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir, archiveDir, t.Unix(), delete)
if err != nil {
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
if delete && archiveDir == "" {
cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n)
} else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
}
}
}
}

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"bufio"
@@ -19,13 +19,15 @@ import (
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/linkedin/goavro/v2"
)
var NumAvroWorkers int = DefaultAvroWorkers
var startUp bool = true
var (
NumAvroWorkers int = DefaultAvroWorkers
startUp bool = true
)
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
levels := make([]*AvroLevel, 0)

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"context"
@@ -11,7 +11,7 @@ import (
"strconv"
"sync"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
@@ -30,8 +30,51 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
for {
select {
case <-ctx.Done():
return
case val := <-LineProtocolMessages:
// Drain any remaining messages in channel before exiting
for {
select {
case val, ok := <-LineProtocolMessages:
if !ok {
// Channel closed
return
}
// Process remaining message
freq, err := GetMetricFrequency(val.MetricName)
if err != nil {
continue
}
metricName := ""
for _, selectorName := range val.Selector {
metricName += selectorName + SelectorDelimiter
}
metricName += val.MetricName
var selector []string
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
if !stringSlicesEqual(oldSelector, selector) {
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
if avroLevel == nil {
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
}
oldSelector = slices.Clone(selector)
}
if avroLevel != nil {
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
default:
// No more messages, exit
return
}
}
case val, ok := <-LineProtocolMessages:
if !ok {
// Channel closed, exit gracefully
return
}
// Fetch the frequency of the metric from the global configuration
freq, err := GetMetricFrequency(val.MetricName)
if err != nil {
@@ -65,7 +108,9 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
oldSelector = slices.Clone(selector)
}
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
if avroLevel != nil {
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
}
}
}()

View File

@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
var (

View File

@@ -0,0 +1,322 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides buffer.go: Time-series data buffer implementation.
//
// # Buffer Architecture
//
// Each metric at each hierarchical level (cluster/host/cpu/etc.) uses a linked-list
// chain of fixed-size buffers to store time-series data. This design:
//
// - Avoids reallocation/copying when growing (new links added instead)
// - Enables efficient pooling (buffers returned to sync.Pool)
// - Supports traversal back in time (via prev pointers)
// - Maintains temporal ordering (newer data in later buffers)
//
// # Buffer Chain Example
//
// [oldest buffer] <- prev -- [older] <- prev -- [newest buffer (head)]
// start=1000 start=1512 start=2024
// data=[v0...v511] data=[v0...v511] data=[v0...v42]
//
// When the head buffer reaches capacity (BufferCap = 512), a new buffer becomes
// the new head and the old head is linked via prev.
//
// # Pooling Strategy
//
// sync.Pool reduces GC pressure for the common case (BufferCap-sized allocations).
// Non-standard capacity buffers are not pooled (e.g., from checkpoint deserialization).
//
// # Time Alignment
//
// Timestamps are aligned to measurement frequency intervals:
//
// index = (timestamp - buffer.start) / buffer.frequency
// actualTime = buffer.start + (frequency / 2) + (index * frequency)
//
// Missing data points are represented as NaN values. The read() function performs
// linear interpolation where possible.
package metricstore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// BufferCap is the default buffer capacity.
// buffer.data will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const BufferCap int = DefaultBufferCapacity
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
// ErrNoData indicates no time-series data exists for the requested metric/level.
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
// ErrDataDoesNotAlign indicates that aggregated data from child scopes
// does not align with the parent scope's expected timestamps/intervals.
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// buffer stores time-series data for a single metric at a specific hierarchical level.
//
// Buffers form doubly-linked chains ordered by time. When capacity is reached,
// a new buffer becomes the head and the old head is linked via prev/next.
//
// Fields:
// - prev: Link to older buffer in the chain (nil if this is oldest)
// - next: Link to newer buffer in the chain (nil if this is newest/head)
// - data: Time-series values (schema.Float supports NaN for missing data)
// - frequency: Measurement interval in seconds
// - start: Start timestamp (adjusted by -frequency/2 for alignment)
// - archived: True if data has been persisted to disk archive
// - closed: True if buffer is no longer accepting writes
//
// Index calculation: index = (timestamp - start) / frequency
// Actual data timestamp: start + (frequency / 2) + (index * frequency)
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// write appends a timestamped value to the buffer chain.
//
// Returns the head buffer (which may be newly created if capacity was reached).
// Timestamps older than the buffer's start are rejected. If the calculated index
// exceeds capacity, a new buffer is allocated and linked as the new head.
//
// Missing timestamps are automatically filled with NaN values to maintain alignment.
// Overwrites are allowed if the index is already within the existing data slice.
//
// Parameters:
// - ts: Unix timestamp in seconds
// - value: Metric value (can be schema.NaN for missing data)
//
// Returns:
// - *buffer: The new head buffer (same as b if no new buffer created)
// - error: Non-nil if timestamp is before buffer start
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
// read retrieves time-series data from the buffer chain for the specified time range.
//
// Traverses the buffer chain backwards (via prev links) if 'from' precedes the current
// buffer's start. Missing data points are represented as NaN. Values are accumulated
// into the provided 'data' slice (using +=, so caller must zero-initialize if needed).
//
// The function adjusts the actual time range returned if data is unavailable at the
// boundaries (returned via adjusted from/to timestamps).
//
// Parameters:
// - from: Start timestamp (Unix seconds)
// - to: End timestamp (Unix seconds, exclusive)
// - data: Pre-allocated slice to accumulate results (must be large enough)
//
// Returns:
// - []schema.Float: Slice of data (may be shorter than input 'data' slice)
// - int64: Actual start timestamp with available data
// - int64: Actual end timestamp (exclusive)
// - error: Non-nil on failure
//
// Panics if 'data' slice is too small to hold all values in [from, to).
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
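// A small round trip through write and read (values illustrative; both
// helpers are package-internal, so this only runs inside metricstore tests):
b := newBuffer(1000, 60)       // start = 1000 - 60/2 = 970, firstWrite() = 1000
b, _ = b.write(1000, 42)       // index 0
b, _ = b.write(1120, 43)       // index 2; index 1 is auto-filled with NaN
out := make([]schema.Float, 3) // zero-initialized, since read() accumulates with +=
vals, from, to, _ := b.read(1000, 1180, out)
// vals == [42, NaN, 43], from == 1000, to == 1180
_, _, _ = vals, from, to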
// free removes buffers older than the specified timestamp from the chain.
//
// Recursively traverses backwards (via prev) and unlinks buffers whose end time
// is before the retention threshold. Freed buffers are returned to the pool if
// they have the standard capacity (BufferCap).
//
// Parameters:
// - t: Retention threshold timestamp (Unix seconds)
//
// Returns:
// - delme: True if the current buffer itself should be deleted by caller
// - n: Number of buffers freed in this subtree
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// forceFreeOldest recursively finds the end of the linked list (the oldest buffer)
// and removes it.
// Returns:
//
// delme: true if 'b' itself is the oldest and should be removed by the caller
// n: the number of buffers freed (will be 1 or 0)
func (b *buffer) forceFreeOldest() (delme bool, n int) {
// If there is a previous buffer, recurse down to find the oldest
if b.prev != nil {
delPrev, freed := b.prev.forceFreeOldest()
// If the previous buffer signals it should be deleted:
if delPrev {
// Unlink references
b.prev.next = nil
// Return to pool if capacity matches
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
// Remove the link from the current buffer
b.prev = nil
}
return false, freed
}
// If b.prev is nil, THIS buffer is the oldest.
// We return true so the parent (or the Level loop) knows to delete reference to 'b'.
return true, 1
}
// iterFromTo invokes callback on every buffer in the chain that overlaps [from, to].
//
// Traverses backwards (via prev) first, then processes current buffer if it overlaps
// the time range. Used for checkpoint/archive operations that need to serialize buffers
// within a specific time window.
//
// Parameters:
// - from: Start timestamp (Unix seconds, inclusive)
// - to: End timestamp (Unix seconds, inclusive)
// - callback: Function to invoke on each overlapping buffer
//
// Returns:
// - error: First error returned by callback, or nil if all succeeded
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -3,7 +3,35 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
// This file implements checkpoint persistence for the in-memory metric store.
//
// Checkpoints enable graceful restarts by periodically saving in-memory metric
// data to disk in either JSON or Avro format. The checkpoint system:
//
// Key Features:
// - Periodic background checkpointing via the Checkpointing() worker
// - Two formats: JSON (human-readable) and Avro (compact, efficient)
// - Parallel checkpoint creation and loading using worker pools
// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|avro}
// - Only saves unarchived data (archived data is already persisted elsewhere)
// - Automatic format detection and fallback during loading
// - GC optimization during loading to prevent excessive heap growth
//
// Checkpoint Workflow:
// 1. Init() loads checkpoints within retention window at startup
// 2. Checkpointing() worker periodically saves new data
// 3. Shutdown() writes final checkpoint before exit
//
// File Organization:
//
// checkpoints/
// cluster1/
// host001/
// 1234567890.json (timestamp = checkpoint start time)
// 1234567950.json
// host002/
// ...
package metricstore
import (
"bufio"
@@ -23,24 +51,27 @@ import (
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/linkedin/goavro/v2"
)
const (
CheckpointFilePerms = 0o644
CheckpointDirPerms = 0o755
GCTriggerInterval = DefaultGCTriggerInterval
CheckpointFilePerms = 0o644 // File permissions for checkpoint files
CheckpointDirPerms = 0o755 // Directory permissions for checkpoint directories
GCTriggerInterval = DefaultGCTriggerInterval // Interval for triggering GC during checkpoint loading
)
// Whenever changed, update MarshalJSON as well!
// CheckpointMetrics represents metric data in a checkpoint file.
// Whenever the structure changes, update MarshalJSON as well!
type CheckpointMetrics struct {
Data []schema.Float `json:"data"`
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
}
// CheckpointFile represents the hierarchical structure of a checkpoint file.
// It mirrors the Level tree structure from the MemoryStore.
type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
@@ -48,10 +79,23 @@ type CheckpointFile struct {
To int64 `json:"to"`
}
var lastCheckpoint time.Time
// lastCheckpoint tracks the timestamp of the last checkpoint creation.
var (
lastCheckpoint time.Time
lastCheckpointMu sync.Mutex
)
// Checkpointing starts a background worker that periodically saves metric data to disk.
//
// The behavior depends on the configured file format:
// - JSON: Periodic checkpointing based on Keys.Checkpoints.Interval
// - Avro: Initial delay + periodic checkpointing at DefaultAvroCheckpointInterval
//
// The worker respects context cancellation and signals completion via the WaitGroup.
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpointMu.Lock()
lastCheckpoint = time.Now()
lastCheckpointMu.Unlock()
if Keys.Checkpoints.FileFormat == "json" {
ms := GetMemoryStore()
@@ -60,9 +104,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
defer wg.Done()
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
if err != nil {
cclog.Fatal(err)
cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error())
}
if d <= 0 {
cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d)
return
}
@@ -74,15 +119,21 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
case <-ctx.Done():
return
case <-ticker.C:
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
lastCheckpointMu.Lock()
from := lastCheckpoint
lastCheckpointMu.Unlock()
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
now := time.Now()
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix())
from.Unix(), now.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
lastCheckpointMu.Lock()
lastCheckpoint = now
lastCheckpointMu.Unlock()
}
}
}
@@ -113,9 +164,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
}
}
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
// MarshalJSON provides optimized JSON encoding for CheckpointMetrics.
//
// Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead.
// This method manually constructs JSON to avoid allocations and interface conversions.
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...)
@@ -137,13 +189,27 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
return buf, nil
}
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only a host at a time is locked, so this function can run
// in parallel to writes/reads.
// ToCheckpoint writes metric data to checkpoint files in parallel.
//
// Metrics at root and cluster levels are skipped. One file per host is created.
// Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host
// at a time, allowing concurrent writes/reads to other hosts.
//
// Returns the number of checkpoint files created and any errors encountered.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*Level, 0)
selectors := make([][]string, 0)
// Pre-calculate capacity by counting cluster/host pairs
m.root.lock.RLock()
totalHosts := 0
for _, l1 := range m.root.children {
l1.lock.RLock()
totalHosts += len(l1.children)
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
levels := make([]*Level, 0, totalHosts)
selectors := make([][]string, 0, totalHosts)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
@@ -203,6 +269,8 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
return int(n), nil
}
// toCheckpointFile recursively converts a Level tree to CheckpointFile structure.
// Skips metrics that are already archived. Returns nil if no unarchived data exists.
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock()
defer l.lock.RUnlock()
@@ -224,6 +292,7 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived {
allArchived = false
return fmt.Errorf("stop") // Early termination signal
}
return nil
})
@@ -267,6 +336,8 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
return retval, nil
}
// toCheckpoint writes a Level's data to a JSON checkpoint file.
// Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save.
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
@@ -278,11 +349,11 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
}
}
if err != nil {
@@ -298,9 +369,54 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
return bw.Flush()
}
// enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs.
// Returns error if directory structure is invalid.
func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
clustersDir, err := os.ReadDir(dir)
if err != nil {
return err
}
gcCounter := 0
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
}
hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if err != nil {
return err
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
}
gcCounter++
if gcCounter%GCTriggerInterval == 0 {
// Forcing garbage collection runs regularly while loading checkpoints
// decreases the total heap size once everything is loaded back into memory.
// While loading data, the heap grows fast, so the GC target size would
// otherwise almost always double. Forcing GCs here keeps it growing more
// slowly, so that less memory is wasted in the end.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
return nil
}
// FromCheckpoint loads checkpoint files from disk into memory in parallel.
//
// Uses worker pool to load cluster/host combinations. Periodically triggers GC
// to prevent excessive heap growth. Returns number of files loaded and any errors.
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers)
work := make(chan [2]string, Keys.NumWorkers*4)
n, errs := int32(0), int32(0)
wg.Add(Keys.NumWorkers)
@@ -319,40 +435,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
}()
}
i := 0
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
// Forcing garbage collection runs here regulary during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will double
// almost always. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
err := enqueueCheckpointHosts(dir, work)
close(work)
wg.Wait()
@@ -366,9 +449,11 @@ done:
return int(n), nil
}
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
// FromCheckpointFiles is the main entry point for loading checkpoints at startup.
//
// Automatically detects checkpoint format (JSON vs Avro) and falls back if needed.
// Creates checkpoint directory if it doesn't exist. This function must be called
// before any writes or reads, and can only be called once.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, so create it using os.MkdirAll()
@@ -408,10 +493,10 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
return m.FromCheckpoint(dir, from, altFormat)
}
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
return 0, nil
}
// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
@@ -731,13 +816,24 @@ func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRece
return nums[a.Name()] < nums[b.Name()]
})
if len(nums) == 0 {
return nil, nil
}
filenames := make([]string, 0)
for i := range direntries {
e := direntries[i]
for i, e := range direntries {
ts1 := nums[e.Name()]
// Select files in the forward or backward direction:
// if findMoreRecentFiles, select all files at or after
// the given timestamp; otherwise select all files at or
// before it.
if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name())
} else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 {
filenames = append(filenames, e.Name())
}
if i == len(direntries)-1 {
continue
@@ -750,10 +846,6 @@ func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRece
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
}
}

View File

@@ -0,0 +1,257 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides config.go: Configuration structures and metric management.
//
// # Configuration Hierarchy
//
// The metricstore package uses nested configuration structures:
//
// MetricStoreConfig (Keys)
// ├─ NumWorkers: Parallel checkpoint/archive workers
// ├─ RetentionInMemory: How long to keep data in RAM
// ├─ MemoryCap: Memory limit in bytes (triggers forceFree)
// ├─ Checkpoints: Persistence configuration
// │ ├─ FileFormat: "avro" or "json"
// │ ├─ Interval: How often to save (e.g., "1h")
// │ └─ RootDir: Checkpoint storage path
// ├─ Archive: Long-term storage configuration
// │ ├─ ArchiveInterval: How often to archive
// │ ├─ RootDir: Archive storage path
// │ └─ DeleteInstead: Delete old data instead of archiving
// ├─ Debug: Development/debugging options
// └─ Subscriptions: NATS topic subscriptions for metric ingestion
//
// # Metric Configuration
//
// Each metric (e.g., "cpu_load", "mem_used") has a MetricConfig entry in the global
// Metrics map, defining:
//
// - Frequency: Measurement interval in seconds
// - Aggregation: How to combine values (sum/avg/none) when transforming scopes
// - offset: Internal index into Level.metrics slice (assigned during Init)
//
// # AggregationStrategy
//
// Determines how to combine metric values when aggregating from finer to coarser scopes:
//
// - NoAggregation: Do not combine (incompatible scopes)
// - SumAggregation: Add values (e.g., power consumption: core→socket)
// - AvgAggregation: Average values (e.g., temperature: core→socket)
package metricstore
import (
"fmt"
"time"
)
const (
DefaultMaxWorkers = 10
DefaultBufferCapacity = 512
DefaultGCTriggerInterval = 100
DefaultAvroWorkers = 4
DefaultCheckpointBufferMin = 3
DefaultAvroCheckpointInterval = time.Minute
)
// Checkpoints configures periodic persistence of in-memory metric data.
//
// Fields:
// - FileFormat: "avro" (default, binary, compact) or "json" (human-readable, slower)
// - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves
// - RootDir: Filesystem path for checkpoint files (created if missing)
type Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
}
// Debug provides development and profiling options.
//
// Fields:
// - DumpToFile: Path to dump checkpoint data for inspection (empty = disabled)
// - EnableGops: Enable gops agent for live runtime debugging (https://github.com/google/gops)
type Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
}
// Archive configures long-term storage of old metric data.
//
// Data older than RetentionInMemory is archived to disk or deleted.
//
// Fields:
// - ArchiveInterval: Duration string (e.g., "24h") between archive operations
// - RootDir: Filesystem path for archived data (created if missing)
// - DeleteInstead: If true, delete old data instead of archiving (saves disk space)
type Archive struct {
ArchiveInterval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
}
// Subscriptions defines NATS topics to subscribe to for metric ingestion.
//
// Each subscription receives metrics via NATS messaging, enabling real-time
// data collection from compute nodes.
//
// Fields:
// - SubscribeTo: NATS subject/channel name (e.g., "metrics.compute.*")
// - ClusterTag: Default cluster name for metrics without cluster tag (optional)
type Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
}
// MetricStoreConfig defines the main configuration for the metricstore.
//
// Loaded from cc-backend's config.json "metricstore" section. Controls memory usage,
// persistence, archiving, and metric ingestion.
//
// Fields:
// - NumWorkers: Parallel workers for checkpoint/archive (0 = auto: min(NumCPU/2+1, 10))
// - RetentionInMemory: Duration string (e.g., "48h") for in-memory data retention
// - MemoryCap: Max bytes for buffer data (0 = unlimited); triggers forceFree when exceeded
// - Checkpoints: Periodic persistence configuration
// - Debug: Development/profiling options (nil = disabled)
// - Archive: Long-term storage configuration (nil = disabled)
// - Subscriptions: NATS topics for metric ingestion (nil = polling only)
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
RetentionInMemory string `json:"retention-in-memory"`
MemoryCap int `json:"memory-cap"`
Checkpoints Checkpoints `json:"checkpoints"`
Debug *Debug `json:"debug"`
Archive *Archive `json:"archive"`
Subscriptions *Subscriptions `json:"nats-subscriptions"`
}
// Keys is the global metricstore configuration instance.
//
// Initialized with defaults, then overwritten by cc-backend's config.json.
// Accessed by Init(), Checkpointing(), and other lifecycle functions.
var Keys MetricStoreConfig = MetricStoreConfig{
Checkpoints: Checkpoints{
FileFormat: "avro",
RootDir: "./var/checkpoints",
},
}
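// A plausible "metricstore" section of cc-backend's config.json that would
// populate Keys (values illustrative; the JSON schema in this package is
// authoritative):
//
//	"metricstore": {
//		"retention-in-memory": "48h",
//		"memory-cap": 16,
//		"num-workers": 4,
//		"checkpoints": {
//			"file-format": "avro",
//			"interval": "1h",
//			"directory": "./var/checkpoints"
//		},
//		"archive": {
//			"interval": "24h",
//			"directory": "./var/archive"
//		},
//		"nats-subscriptions": [
//			{ "subscribe-to": "hpc-nats", "cluster-tag": "emmy" }
//		]
//	}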
// AggregationStrategy defines how to combine metric values across hierarchy levels.
//
// Used when transforming data from finer-grained scopes (e.g., core) to coarser scopes
// (e.g., socket). This is SPATIAL aggregation, not TEMPORAL (time-based) aggregation.
//
// Values:
// - NoAggregation: Do not aggregate (incompatible scopes or non-aggregatable metrics)
// - SumAggregation: Add values (e.g., power: sum core power → socket power)
// - AvgAggregation: Average values (e.g., temperature: average core temps → socket temp)
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota // Do not aggregate
SumAggregation // Sum values (e.g., power, energy)
AvgAggregation // Average values (e.g., temperature, utilization)
)
// AssignAggregationStrategy parses a string into an AggregationStrategy value.
//
// Used when loading metric configurations from JSON/YAML files.
//
// Parameters:
// - str: "sum", "avg", or "" (empty string for NoAggregation)
//
// Returns:
// - AggregationStrategy: Parsed value
// - error: Non-nil if str is unrecognized
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
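// Example: mapping an aggregation string from a metric definition
// (error handling abridged):
agg, err := AssignAggregationStrategy("sum")
if err != nil {
	// unknown strategy string
}
_ = agg // SumAggregation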
// MetricConfig defines configuration for a single metric type.
//
// Stored in the global Metrics map, keyed by metric name (e.g., "cpu_load").
//
// Fields:
// - Frequency: Measurement interval in seconds (e.g., 60 for 1-minute granularity)
// - Aggregation: How to combine values across hierarchy levels (sum/avg/none)
// - offset: Internal index into Level.metrics slice (assigned during Init)
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
// Metrics is the global map of metric configurations.
//
// Keyed by metric name (e.g., "cpu_load", "mem_used"). Populated during Init()
// from cluster configuration and checkpoint restoration. Each MetricConfig.offset
// corresponds to the buffer slice index in Level.metrics.
var Metrics map[string]MetricConfig
// GetMetricFrequency retrieves the measurement interval for a metric.
//
// Parameters:
// - metricName: Metric name (e.g., "cpu_load")
//
// Returns:
// - int64: Frequency in seconds
// - error: Non-nil if metric not found in Metrics map
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric registers a new metric or updates an existing one.
//
// If the metric already exists with a different frequency, the larger
// Frequency value (the longer measurement interval) is kept. This handles
// cases where different clusters report the same metric at different intervals.
//
// Parameters:
// - name: Metric name (e.g., "cpu_load")
// - metric: Configuration (frequency, aggregation strategy)
//
// Returns:
// - error: Always nil (signature for future error handling)
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
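// Sketch of the merge behavior with illustrative values: registering the
// same metric twice keeps the larger Frequency value.
_ = AddMetric("cpu_load", MetricConfig{Frequency: 30, Aggregation: AvgAggregation})
_ = AddMetric("cpu_load", MetricConfig{Frequency: 60, Aggregation: AvgAggregation})
freq, _ := GetMetricFrequency("cpu_load") // freq == 60
_ = freq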

View File

@@ -0,0 +1,77 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"num-workers": {
"description": "Number of concurrent workers for checkpoint and archive operations",
"type": "integer"
},
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
}
},
"required": ["interval"]
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the directy in which the archived files should be placed.",
"type": "string"
}
},
"required": ["interval", "directory"]
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"memory-cap": {
"description": "Upper memory capacity limit used by metricstore in GB",
"type": "integer"
},
"nats-subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
},
"required": ["checkpoints", "retention-in-memory", "memory-cap"]
}`

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"bufio"

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"bufio"

View File

@@ -0,0 +1,385 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides level.go: Hierarchical tree structure for metric storage.
//
// # Level Architecture
//
// The Level type forms a tree structure where each node represents a level in the
// ClusterCockpit hierarchy: cluster → host → socket → core → hwthread, with special
// nodes for memory domains and accelerators.
//
// Structure:
//
// Root Level (cluster="emmy")
// ├─ Level (host="node001")
// │ ├─ Level (socket="0")
// │ │ ├─ Level (core="0") [stores cpu0 metrics]
// │ │ └─ Level (core="1") [stores cpu1 metrics]
// │ └─ Level (socket="1")
// │ └─ ...
// └─ Level (host="node002")
// └─ ...
//
// Each Level can:
// - Hold data (metrics slice of buffer pointers)
// - Have child nodes (children map[string]*Level)
// - Both simultaneously (inner nodes can store aggregated metrics)
//
// # Selector Paths
//
// Selectors are hierarchical paths: []string{"cluster", "host", "component"}.
// Example: []string{"emmy", "node001", "cpu0"} navigates to the cpu0 core level.
//
// # Concurrency
//
// RWMutex protects children map and metrics slice. Read-heavy workload (metric reads)
// uses RLock. Writes (new levels, buffer updates) use Lock. Double-checked locking
// prevents races during level creation.
package metricstore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
// Level represents a node in the hierarchical metric storage tree.
//
// Can be both a leaf or inner node. Inner nodes hold data in 'metrics' for aggregated
// values (e.g., socket-level metrics derived from core-level data). Named "Level"
// instead of "node" to avoid confusion with cluster nodes (hosts).
//
// Fields:
// - children: Map of child level names to Level pointers (e.g., "cpu0" → Level)
// - metrics: Slice of buffer pointers (one per metric, indexed by MetricConfig.offset)
// - lock: RWMutex for concurrent access (read-heavy, write-rare)
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// findLevelOrCreate navigates to or creates the level specified by selector.
//
// Recursively descends the tree, creating missing levels as needed. Uses double-checked
// locking: RLock first (fast path), then Lock if creation needed (slow path), then
// re-check after acquiring Lock to handle races.
//
// Example selector: []string{"emmy", "node001", "cpu0"}
// Navigates: root → emmy → node001 → cpu0, creating levels as needed.
//
// Parameters:
// - selector: Hierarchical path (consumed recursively, decreasing depth)
// - nMetrics: Number of metric slots to allocate in new levels
//
// Returns:
// - *Level: The target level (existing or newly created)
//
// Note: sync.Map may improve performance for high-concurrency writes, but current
// approach suffices for read-heavy workload.
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
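// Sketch: creating, then looking up, the same level (names illustrative):
root := &Level{}
created := root.findLevelOrCreate([]string{"emmy", "node001", "cpu0"}, len(Metrics))
found := root.findLevel([]string{"emmy", "node001", "cpu0"})
// found == created; findLevel (defined below) returns nil for paths that
// were never created
_, _ = created, found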
// collectPaths gathers all selector paths at the specified depth in the tree.
//
// Recursively traverses children, collecting paths when currentDepth+1 == targetDepth.
// Each path is a selector that can be used with findLevel() or findBuffers().
//
// Explicitly copies slices to avoid shared underlying arrays between siblings, preventing
// unintended mutations.
//
// Parameters:
// - currentDepth: Depth of current level (0 = root)
// - targetDepth: Depth to collect paths from
// - currentPath: Path accumulated so far
// - results: Output slice (appended to)
//
// Example: collectPaths(0, 2, []string{}, &results) collects all 2-level paths
// like []string{"emmy", "node001"}, []string{"emmy", "node002"}, etc.
func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string, results *[][]string) {
l.lock.RLock()
defer l.lock.RUnlock()
for key, child := range l.children {
if child == nil {
continue
}
// We explicitly make a new slice and copy data to avoid sharing underlying arrays between siblings
newPath := make([]string, len(currentPath))
copy(newPath, currentPath)
newPath = append(newPath, key)
// Check depth, and just return if depth reached
if currentDepth+1 == targetDepth {
*results = append(*results, newPath)
} else {
child.collectPaths(currentDepth+1, targetDepth, newPath, results)
}
}
}
// free removes buffers older than the retention threshold from the entire subtree.
//
// Recursively frees buffers in this level's metrics and all child levels. Buffers
// with standard capacity (BufferCap) are returned to the pool. Called by the
// retention worker to enforce retention policies.
//
// Parameters:
// - t: Retention threshold timestamp (Unix seconds)
//
// Returns:
// - int: Total number of buffers freed in this subtree
// - error: Non-nil on failure (propagated from children)
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
// forceFree removes the oldest buffer from each metric chain in the subtree.
//
// Unlike free(), which removes based on time threshold, this unconditionally removes
// the oldest buffer in each chain. Used by MemoryUsageTracker when memory cap is
// exceeded and time-based retention is insufficient.
//
// Recursively processes current level's metrics and all child levels.
//
// Returns:
// - int: Total number of buffers freed in this subtree
// - error: Non-nil on failure (propagated from children)
func (l *Level) forceFree() (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
// Iterate over metrics in the current level
for i, b := range l.metrics {
if b != nil {
// Attempt to free the oldest buffer in this chain
delme, freedCount := b.forceFreeOldest()
n += freedCount
// If delme is true, it means 'b' itself (the head) was the oldest
// and needs to be removed from the slice.
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
// Recursively traverse children
for _, child := range l.children {
m, err := child.forceFree()
n += m
if err != nil {
return n, err
}
}
return n, nil
}
// sizeInBytes calculates the total memory usage of all buffers in the subtree.
//
// Recursively sums buffer data sizes (count of Float values × sizeof(Float)) across
// this level's metrics and all child levels. Used by MemoryUsageTracker to enforce
// memory cap limits.
//
// Returns:
// - int64: Total bytes used by buffer data in this subtree
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
// findLevel navigates to the level specified by selector, returning nil if not found.
//
// Read-only variant of findLevelOrCreate. Does not create missing levels.
// Recursively descends the tree following the selector path.
//
// Parameters:
// - selector: Hierarchical path (e.g., []string{"emmy", "node001", "cpu0"})
//
// Returns:
// - *Level: The target level, or nil if any component in the path does not exist
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
// findBuffers invokes callback on all buffers matching the selector pattern.
//
// Supports flexible selector patterns (from cc-lib/util.Selector):
// - Exact match: Selector element with String set (e.g., "node001")
// - Group match: Selector element with Group set (e.g., ["cpu0", "cpu2", "cpu4"])
// - Wildcard: Selector element with Any=true (matches all children)
//
// Empty selector (len==0) matches current level's buffer at 'offset' and recursively
// all descendant buffers at the same offset (used for aggregation queries).
//
// Parameters:
// - selector: Pattern to match (consumed recursively)
// - offset: Metric index in metrics slice (from MetricConfig.offset)
// - f: Callback invoked on each matching buffer
//
// Returns:
// - error: First error returned by callback, or nil if all succeeded
//
// Example:
//
// // Find all cpu0 buffers across all hosts:
// findBuffers([]Selector{{Any: true}, {String: "cpu0"}}, metricOffset, callback)
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}
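// Example (hypothetical usage sketch; `root` and `metricOffset` are assumed
// package-internal values, and the callback here only counts buffers):
//
//	// Count all cpu0 buffers across every host of cluster "emmy":
//	sel := util.Selector{{String: "emmy"}, {Any: true}, {String: "cpu0"}}
//	count := 0
//	_ = root.findBuffers(sel, metricOffset, func(b *buffer) error {
//		count++
//		return nil
//	})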

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"context"
@@ -12,8 +12,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
@@ -29,29 +29,30 @@ func ReceiveNats(ms *MemoryStore,
}
var wg sync.WaitGroup
msgs := make(chan []byte, workers*2)
for _, sc := range Keys.Subscriptions {
for _, sc := range *Keys.Subscriptions {
clusterTag := sc.ClusterTag
if workers > 1 {
wg.Add(workers)
for range workers {
go func() {
defer wg.Done()
for m := range msgs {
dec := lineprotocol.NewDecoderWithBytes(m)
if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error())
}
}
wg.Done()
}()
}
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
msgs <- data
select {
case msgs <- data:
case <-ctx.Done():
}
})
} else {
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
@@ -64,7 +65,11 @@ func ReceiveNats(ms *MemoryStore,
cclog.Infof("NATS subscription to '%s' established", sc.SubscribeTo)
}
close(msgs)
go func() {
<-ctx.Done()
close(msgs)
}()
wg.Wait()
return nil

View File

@@ -0,0 +1,750 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package metricstore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"slices"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
shutdownFuncMu sync.Mutex // Protects shutdownFunc from concurrent access
)
// NodeProvider provides information about nodes currently in use by running jobs.
//
// This interface allows metricstore to query job information without directly
// depending on the repository package, breaking the import cycle.
//
// Implementations should return nodes that are actively processing jobs started
// before the given timestamp. These nodes will be excluded from retention-based
// garbage collection to prevent data loss for jobs that are still running or
// recently completed.
type NodeProvider interface {
// GetUsedNodes returns a map of cluster names to sorted lists of unique hostnames
// that are currently in use by jobs that started before the given timestamp.
//
// Parameters:
// - ts: Unix timestamp threshold - returns nodes with jobs started before this time
//
// Returns:
// - Map of cluster names to lists of node hostnames that should be excluded from garbage collection
// - Error if the query fails
GetUsedNodes(ts int64) (map[string][]string, error)
}
// Metric represents a single metric data point to be written to the store.
type Metric struct {
Name string
Value schema.Float
// MetricConfig contains frequency and aggregation settings for this metric.
// If Frequency is 0, configuration will be looked up from MemoryStore.Metrics during Write().
MetricConfig MetricConfig
}
// MemoryStore is the main in-memory time-series metric storage implementation.
//
// It organizes metrics in a hierarchical tree structure where each level represents
// a component of the system hierarchy (e.g., cluster → host → CPU). Each level can
// store multiple metrics as time-series buffers.
//
// The store is initialized as a singleton via InitMetrics() and accessed via GetMemoryStore().
// All public methods are safe for concurrent use.
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
nodeProvider NodeProvider
}
// Init initializes the metric store from configuration and starts background workers.
//
// This function must be called exactly once before any other metricstore operations.
// It performs the following initialization steps:
// 1. Validates and decodes the metric store configuration
// 2. Configures worker pool size (defaults to NumCPU/2+1, max 10)
// 3. Loads metric configurations from all registered clusters
// 4. Restores checkpoints within the retention window
// 5. Starts background workers for retention, checkpointing, archiving, and monitoring
// 6. Optionally subscribes to NATS for real-time metric ingestion
//
// Parameters:
// - rawConfig: JSON configuration for the metric store (see MetricStoreConfig)
// - wg: WaitGroup that will be incremented for each background goroutine started
//
// The function will call cclog.Fatal on critical errors during initialization.
// Use Shutdown() to cleanly stop all background workers started by Init().
//
// Note: Signal handling must be implemented by the caller. Call Shutdown() when
// receiving termination signals to ensure checkpoint data is persisted.
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc *schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Initialize the singleton MemoryStore with the collected metric configurations
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that a GC is only triggered once the ratio of new allocations
// exceeds the previously active heap.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
retentionGoroutines := 1
checkpointingGoroutines := 1
dataStagingGoroutines := 1
archivingGoroutines := 1
memoryUsageTracker := 1
totalGoroutines := retentionGoroutines +
checkpointingGoroutines +
dataStagingGoroutines +
archivingGoroutines +
memoryUsageTracker
wg.Add(totalGoroutines)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
MemoryUsageTracker(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// the shutdown() function when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFuncMu.Lock()
shutdownFunc = shutdown
shutdownFuncMu.Unlock()
if Keys.Subscriptions != nil {
err = ReceiveNats(ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
}
}
// InitMetrics initializes the singleton MemoryStore instance with the given metric configurations.
//
// This function must be called before GetMemoryStore() and can only be called once due to
// the singleton pattern. It assigns each metric an internal offset for efficient buffer indexing.
//
// Parameters:
// - metrics: Map of metric names to their configurations (frequency and aggregation strategy)
//
// Panics if any metric has Frequency == 0, which indicates an invalid configuration.
//
// After this call, the global msInstance is ready for use via GetMemoryStore().
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
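// Example (hypothetical test-style setup that bypasses Init() and its
// background workers; metric names and frequencies are illustrative):
//
//	InitMetrics(map[string]MetricConfig{
//		"cpu_load": {Frequency: 60, Aggregation: AvgAggregation},
//		"mem_used": {Frequency: 60, Aggregation: SumAggregation},
//	})
//	ms := GetMemoryStore()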
// GetMemoryStore returns the singleton MemoryStore instance.
//
// Returns the initialized MemoryStore singleton. Calls cclog.Fatal if InitMetrics() was not called first.
//
// This function is safe for concurrent use after initialization.
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
}
return msInstance
}
// SetNodeProvider sets the NodeProvider implementation for the MemoryStore.
// This must be called during initialization to provide job state information
// for selective buffer retention during Free operations.
// If not set, the Free function will fall back to freeing all buffers.
func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) {
ms.nodeProvider = provider
}
// Shutdown performs a graceful shutdown of the metric store.
//
// This function cancels all background goroutines started by Init() and writes
// a final checkpoint to disk before returning. It should be called when the
// application receives a termination signal.
//
// The function will:
// 1. Cancel the context to stop all background workers
// 2. Close NATS message channels if using Avro format
// 3. Write a final checkpoint to preserve in-memory data
// 4. Log any errors encountered during shutdown
//
// Note: This function blocks until the final checkpoint is written.
func Shutdown() {
shutdownFuncMu.Lock()
defer shutdownFuncMu.Unlock()
if shutdownFunc != nil {
shutdownFunc()
}
if Keys.Checkpoints.FileFormat != "json" {
close(LineProtocolMessages)
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
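// Example (minimal lifecycle sketch from the caller's side; signal handling
// is the caller's responsibility as noted on Init, and the os/signal and
// syscall imports are assumed):
//
//	var wg sync.WaitGroup
//	Init(rawConfig, &wg)
//	sigs := make(chan os.Signal, 1)
//	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
//	<-sigs
//	Shutdown() // blocks until the final checkpoint is written
//	wg.Wait()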
// Retention starts a background goroutine that periodically frees old metric data.
//
// This worker runs at half the retention interval and calls Free() to remove buffers
// older than the configured retention time. It respects the NodeProvider to preserve
// data for nodes with active jobs.
//
// Parameters:
// - wg: WaitGroup to signal completion when context is cancelled
// - ctx: Context for cancellation signal
//
// The goroutine exits when ctx is cancelled.
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
tickInterval := d / 2
if tickInterval <= 0 {
return
}
ticker := time.NewTicker(tickInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := Free(ms, t)
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// MemoryUsageTracker starts a background goroutine that monitors memory usage.
//
// This worker checks memory usage every minute and force-frees buffers if memory
// exceeds the configured cap. It protects against infinite loops by limiting
// iterations and forcing garbage collection between attempts.
//
// Parameters:
// - wg: WaitGroup to signal completion when context is cancelled
// - ctx: Context for cancellation signal
//
// The goroutine exits when ctx is cancelled.
func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d := 1 * time.Minute
ticker := time.NewTicker(d)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
memoryUsageGB := ms.SizeInGB()
cclog.Infof("[METRICSTORE]> current memory usage: %.2f GB\n", memoryUsageGB)
if memoryUsageGB > float64(Keys.MemoryCap) {
cclog.Warnf("[METRICSTORE]> current memory usage is greater than the Memory Cap: %d GB\n", Keys.MemoryCap)
cclog.Warnf("[METRICSTORE]> starting to force-free the buffers from the Metric Store\n")
freedTotal := 0
const maxIterations = 100
for range maxIterations {
memoryUsageGB = ms.SizeInGB()
if memoryUsageGB < float64(Keys.MemoryCap) {
break
}
freed, err := ms.ForceFree()
if err != nil {
cclog.Errorf("[METRICSTORE]> error while force-freeing the buffers: %s", err)
}
if freed == 0 {
cclog.Errorf("[METRICSTORE]> 0 buffers force-freed in last try, %d total buffers force-freed, memory usage of %.2f GB remains higher than the memory cap of %d GB and there are no buffers left to force-free\n", freedTotal, memoryUsageGB, Keys.MemoryCap)
break
}
freedTotal += freed
runtime.GC()
}
if memoryUsageGB >= float64(Keys.MemoryCap) {
cclog.Errorf("[METRICSTORE]> reached maximum iterations (%d) or no more buffers to free, current memory usage: %.2f GB\n", maxIterations, memoryUsageGB)
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freedTotal)
cclog.Infof("[METRICSTORE]> current memory usage after force-freeing the buffers: %.2f GB\n", memoryUsageGB)
}
}
}
}
}()
}
// Free removes metric data older than the given time while preserving data for active nodes.
//
// This function implements intelligent retention by consulting the NodeProvider (if configured)
// to determine which nodes are currently in use by running jobs. Data for these nodes is
// preserved even if older than the retention time.
//
// Parameters:
// - ms: The MemoryStore instance
// - t: Time threshold - buffers with data older than this will be freed
//
// Returns:
// - Number of buffers freed
// - Error if NodeProvider query fails
//
// Behavior:
// - If no NodeProvider is set: frees all buffers older than t
// - If NodeProvider returns empty map: frees all buffers older than t
// - Otherwise: preserves buffers for nodes returned by GetUsedNodes(), frees others
func Free(ms *MemoryStore, t time.Time) (int, error) {
// If no NodeProvider is configured, free all buffers older than t
if ms.nodeProvider == nil {
return ms.Free(nil, t.Unix())
}
excludeSelectors, err := ms.nodeProvider.GetUsedNodes(t.Unix())
if err != nil {
return 0, err
}
// If GetUsedNodes() returned no nodes, fall back to the
// default Free method with a nil selector
if len(excludeSelectors) == 0 {
return ms.Free(nil, t.Unix())
}
// Otherwise build the list of selectors not covered by the
// exclusion map and free only those
selectors := GetSelectors(ms, excludeSelectors)
return FreeSelected(ms, selectors, t)
}
// FreeSelected frees buffers for specific selectors while preserving others.
//
// This function is used when we want to retain some specific nodes beyond the retention time.
// It iterates through the provided selectors and frees their associated buffers.
//
// Parameters:
// - ms: The MemoryStore instance
// - selectors: List of selector paths to free (e.g., [["cluster1", "node1"], ["cluster2", "node2"]])
// - t: Time threshold for freeing buffers
//
// Returns the total number of buffers freed. Errors for individual selectors
// are logged and skipped, so the returned error is currently always nil.
func FreeSelected(ms *MemoryStore, selectors [][]string, t time.Time) (int, error) {
freed := 0
for _, selector := range selectors {
freedBuffers, err := ms.Free(selector, t.Unix())
if err != nil {
cclog.Errorf("error while freeing selected buffers: %#v", err)
}
freed += freedBuffers
}
return freed, nil
}
// GetSelectors returns all selectors at depth 2 (cluster/node level) that are NOT in the exclusion map.
//
// This function generates a list of selectors whose buffers should be freed by excluding
// selectors that correspond to nodes currently in use by running jobs.
//
// Parameters:
// - ms: The MemoryStore instance
// - excludeSelectors: Map of cluster names to node hostnames that should NOT be freed
//
// Returns a list of selectors ([]string paths) that can be safely freed.
//
// Example:
//
// If the tree has paths ["emmy", "node001"] and ["emmy", "node002"],
// and excludeSelectors contains {"emmy": ["node001"]},
// then only [["emmy", "node002"]] is returned.
func GetSelectors(ms *MemoryStore, excludeSelectors map[string][]string) [][]string {
allSelectors := ms.GetPaths(2)
filteredSelectors := make([][]string, 0, len(allSelectors))
for _, path := range allSelectors {
if len(path) < 2 {
continue
}
key := path[0] // The "Key" (Level 1)
value := path[1] // The "Value" (Level 2)
exclude := false
// Check if the key exists in our exclusion map
if excludedValues, exists := excludeSelectors[key]; exists {
// The key exists, now check if the specific value is in the exclusion list
if slices.Contains(excludedValues, value) {
exclude = true
}
}
if !exclude {
filteredSelectors = append(filteredSelectors, path)
}
}
return filteredSelectors
}
// GetPaths returns a list of lists (paths) to the specified depth.
func (ms *MemoryStore) GetPaths(targetDepth int) [][]string {
var results [][]string
// Start recursion. Initial path is empty.
// We treat Root as depth 0.
ms.root.collectPaths(0, targetDepth, []string{}, &results)
return results
}
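// Example (hypothetical): with buffers written under ["emmy","node001","cpu0"]
// and ["emmy","node002"], GetPaths(2) yields the cluster/node pairs
// [["emmy","node001"], ["emmy","node002"]]; GetPaths(3) descends one level
// further and yields only [["emmy","node001","cpu0"]].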
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
cclog.Debugf("[METRICSTORE]> Unknown metric '%s' in Write() - skipping", metric.Name)
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
// GetLevel returns the level addressed by `selector`, creating missing levels along the way.
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig in each Metric is already filled in
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
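// Example (hypothetical write sketch; Write resolves the MetricConfig by
// name from ms.Metrics, so callers only supply the metric name and value):
//
//	ms := GetMemoryStore()
//	err := ms.Write([]string{"emmy", "node001"}, time.Now().Unix(), []Metric{
//		{Name: "cpu_load", Value: schema.Float(1.5)},
//	})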
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
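// Example (hypothetical read sketch; `start` and `end` are assumed Unix
// timestamps). A wildcard selector aggregates over all hosts of the cluster
// according to the metric's aggregation strategy:
//
//	sel := util.Selector{{String: "emmy"}, {Any: true}}
//	data, from, to, res, err := ms.Read(sel, "cpu_load", start, end, 60)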
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
// ForceFree unconditionally removes the oldest buffer from every metric chain
// in the entire tree. Used when the memory cap is exceeded and time-based
// retention alone is insufficient.
func (m *MemoryStore) ForceFree() (int, error) {
return m.GetLevel(nil).forceFree()
}
// FreeAll drops all children of the root level, releasing the entire tree.
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
// SizeInBytes returns the total bytes of buffer data held by the store.
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// SizeInGB returns the total buffer data size in gigabytes (1e9 bytes).
func (m *MemoryStore) SizeInGB() float64 {
return float64(m.root.sizeInBytes()) / 1e9
}
// ListChildren returns the names of all direct children of the level
// selected by `selector`.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}
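// Example (hypothetical): listing the hosts of a cluster, then the
// components of one host:
//
//	hosts := ms.ListChildren([]string{"emmy"})
//	components := ms.ListChildren([]string{"emmy", "node001"})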

View File

@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestAssignAggregationStrategy(t *testing.T) {

View File

@@ -3,7 +3,28 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
// This file implements high-level query functions for loading job metric data
// with automatic scope transformation and aggregation.
//
// Key Concepts:
//
// Metric Scopes: Metrics are collected at different granularities (native scope):
// - HWThread: Per hardware thread
// - Core: Per CPU core
// - Socket: Per CPU socket
// - MemoryDomain: Per memory domain (NUMA)
// - Accelerator: Per GPU/accelerator
// - Node: Per compute node
//
// Scope Transformation: The buildQueries functions transform between native scope
// and requested scope by:
// - Aggregating finer-grained data (e.g., HWThread → Core → Socket → Node)
// - Rejecting requests for finer granularity than available
// - Handling special cases (e.g., Accelerator metrics)
//
// Query Building: Constructs APIQuery structures with proper selectors (Type, TypeIds)
// based on cluster topology and job resources.
package metricstore
import (
"context"
@@ -13,10 +34,37 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// TestLoadDataCallback allows tests to override LoadData behavior for testing purposes.
// When set to a non-nil function, LoadData will call this function instead of the default implementation.
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// LoadData loads metric data for a specific job with automatic scope transformation.
//
// This is the primary function for retrieving job metric data. It handles:
// - Building queries with scope transformation via buildQueries
// - Fetching data from the metric store
// - Organizing results by metric and scope
// - Converting NaN statistics to 0 for JSON compatibility
// - Partial error handling (returns data for successful queries even if some fail)
//
// Parameters:
// - job: Job metadata including cluster, resources, and time range
// - metrics: List of metric names to load
// - scopes: Requested metric scopes (will be transformed to match native scopes)
// - ctx: Context for cancellation (currently unused but reserved for future use)
// - resolution: Data resolution in seconds (0 for native resolution)
//
// Returns:
// - JobData: Map of metric → scope → JobMetric with time-series data and statistics
// - Error: Returns error if query building or fetching fails, or partial error listing failed hosts
//
// Example:
//
// jobData, err := LoadData(job, []string{"cpu_load", "mem_used"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 60)
func LoadData(
job *schema.Job,
metrics []string,
@@ -24,6 +72,10 @@ func LoadData(
ctx context.Context,
resolution int,
) (schema.JobData, error) {
if TestLoadDataCallback != nil {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
queries, assignedScope, err := buildQueries(job, metrics, scopes, int64(resolution))
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
@@ -84,12 +136,7 @@ func LoadData(
*id = query.TypeIds[ndx]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// "schema.Float()" because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
sanitizeStats(&res)
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
@@ -119,6 +166,10 @@ func LoadData(
return jobData, nil
}
// Pre-converted scope strings avoid repeated string(MetricScope) allocations during
// query construction. These are used in APIQuery.Type field throughout buildQueries
// and buildNodeQueries functions. Converting once at package initialization improves
// performance for high-volume query building.
var (
hwthreadString = string(schema.MetricScopeHWThread)
coreString = string(schema.MetricScopeCore)
@@ -127,12 +178,41 @@ var (
acceleratorString = string(schema.MetricScopeAccelerator)
)
// buildQueries constructs APIQuery structures with automatic scope transformation for a job.
//
// This function implements the core scope transformation logic, handling all combinations of
// native metric scopes and requested scopes. It uses the cluster topology to determine which
// hardware IDs to include in each query.
//
// Scope Transformation Rules:
// - If native scope >= requested scope: Aggregates data (Aggregate=true in APIQuery)
// - If native scope < requested scope: Returns error (cannot increase granularity)
// - Special handling for Accelerator scope (independent of CPU hierarchy)
//
// The function generates one or more APIQuery per (metric, scope, host) combination:
// - For non-aggregated queries: One query with all relevant IDs
// - For aggregated queries: May generate multiple queries (e.g., one per socket/core)
//
// Parameters:
// - job: Job metadata including cluster, subcluster, and resource allocation
// - metrics: List of metrics to query
// - scopes: Requested scopes for each metric
// - resolution: Data resolution in seconds
//
// Returns:
// - []APIQuery: List of queries to execute
// - []schema.MetricScope: Assigned scope for each query (after transformation)
// - error: Returns error if topology lookup fails or unhandled scope combination encountered
func buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int64,
) ([]APIQuery, []schema.MetricScope, error) {
if len(job.Resources) == 0 {
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no resources allocated for job %d", job.JobID)
}
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := []schema.MetricScope{}
@@ -145,7 +225,6 @@ func buildQueries(
for _, metric := range metrics {
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
@@ -164,10 +243,9 @@ func buildQueries(
}
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
// Avoid duplicates using map for O(1) lookup
handledScopes := make(map[schema.MetricScope]bool, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
@@ -175,12 +253,10 @@ func buildQueries(
}
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
if handledScopes[scope] {
continue
}
handledScopes = append(handledScopes, scope)
handledScopes[scope] = true
for _, host := range job.Resources {
hwthreads := host.HWThreads
@@ -225,7 +301,7 @@ func buildQueries(
continue
}
// HWThread -> HWThead
// HWThread -> HWThread
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, APIQuery{
Metric: metric,
@@ -349,7 +425,7 @@ func buildQueries(
continue
}
// MemoryDoman -> Node
// MemoryDomain -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, APIQuery{
@@ -413,12 +489,26 @@ func buildQueries(
return queries, assignedScope, nil
}
// LoadStats loads only metric statistics (avg/min/max) for a job at node scope.
//
// This is an optimized version of LoadData that fetches only statistics without
// time-series data, reducing bandwidth and memory usage. Always queries at node scope.
//
// Parameters:
// - job: Job metadata
// - metrics: List of metric names
// - ctx: Context (currently unused)
//
// Returns:
// - Map of metric → hostname → statistics
// - Error on query building or fetching failure
func LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
// TODO(#166): Add scope parameter for analysis view accelerator normalization
queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
return nil, err
@@ -470,6 +560,20 @@ func LoadStats(
return stats, nil
}
// LoadScopedStats loads metric statistics for a job with scope-aware grouping.
//
// Similar to LoadStats but supports multiple scopes and returns statistics grouped
// by scope with hardware IDs (e.g., per-core, per-socket statistics).
//
// Parameters:
// - job: Job metadata
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - ctx: Context (currently unused)
//
// Returns:
// - ScopedJobStats: Map of metric → scope → []ScopedStats (with hostname and ID)
// - Error or partial error listing failed queries
func LoadScopedStats(
job *schema.Job,
metrics []string,
@@ -526,12 +630,7 @@ func LoadScopedStats(
*id = query.TypeIds[ndx]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// "schema.Float()" because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
sanitizeStats(&res)
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: query.Hostname,
@@ -560,6 +659,22 @@ func LoadScopedStats(
return scopedJobStats, nil
}
// LoadNodeData loads metric data for specific nodes in a cluster over a time range.
//
// Unlike LoadData which operates on job resources, this function queries arbitrary nodes
// directly. Useful for system monitoring and node status views.
//
// Parameters:
// - cluster: Cluster name
// - metrics: List of metric names
// - nodes: List of node hostnames (nil = all nodes in cluster via ForAllNodes)
// - scopes: Requested metric scopes (currently unused - always node scope)
// - from, to: Time range
// - ctx: Context (currently unused)
//
// Returns:
// - Map of hostname → metric → []JobMetric
// - Error or partial error listing failed queries
func LoadNodeData(
cluster string,
metrics, nodes []string,
@@ -608,14 +723,10 @@ func LoadNodeData(
metric := query.Metric
qdata := res[0]
if qdata.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
}
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
}
sanitizeStats(&qdata)
hostdata, ok := data[query.Hostname]
if !ok {
@@ -649,6 +760,24 @@ func LoadNodeData(
return data, nil
}
// LoadNodeListData loads metric data for a list of nodes with full scope transformation support.
//
// This is the most flexible node data loading function, supporting arbitrary scopes and
// resolution. Uses buildNodeQueries for proper scope transformation based on topology.
//
// Parameters:
// - cluster: Cluster name
// - subCluster: SubCluster name (empty string to infer from node names)
// - nodes: List of node hostnames
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - resolution: Data resolution in seconds
// - from, to: Time range
// - ctx: Context (currently unused)
//
// Returns:
// - Map of hostname → JobData (metric → scope → JobMetric)
// - Error or partial error listing failed queries
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
@@ -689,7 +818,7 @@ func LoadNodeListData(
} else {
query = req.Queries[i]
}
// qdata := res[0]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
@@ -735,12 +864,7 @@ func LoadNodeListData(
*id = query.TypeIds[ndx]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// "schema.Float()" because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
sanitizeStats(&res)
scopeData.Series = append(scopeData.Series, schema.Series{
Hostname: query.Hostname,
@@ -763,6 +887,23 @@ func LoadNodeListData(
return data, nil
}
// buildNodeQueries constructs APIQuery structures for node-based queries with scope transformation.
//
// Similar to buildQueries but operates on node lists rather than job resources.
// Supports dynamic subcluster lookup when subCluster parameter is empty.
//
// Parameters:
// - cluster: Cluster name
// - subCluster: SubCluster name (empty = infer from node hostnames)
// - nodes: List of node hostnames
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - resolution: Data resolution in seconds
//
// Returns:
// - []APIQuery: List of queries to execute
// - []schema.MetricScope: Assigned scope for each query
// - error: Returns error if topology lookup fails or unhandled scope combination
func buildNodeQueries(
cluster string,
subCluster string,
@@ -771,6 +912,10 @@ func buildNodeQueries(
scopes []schema.MetricScope,
resolution int64,
) ([]APIQuery, []schema.MetricScope, error) {
if len(nodes) == 0 {
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no nodes specified for query")
}
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{}
@@ -788,7 +933,6 @@ func buildNodeQueries(
for _, metric := range metrics {
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster)
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
@@ -807,20 +951,17 @@ func buildNodeQueries(
}
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
// Avoid duplicates using map for O(1) lookup
handledScopes := make(map[schema.MetricScope]bool, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
if handledScopes[scope] {
continue
}
handledScopes = append(handledScopes, scope)
handledScopes[scope] = true
for _, hostname := range nodes {
@@ -843,7 +984,7 @@ func buildNodeQueries(
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
continue scopesLoop
continue
}
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
@@ -883,7 +1024,7 @@ func buildNodeQueries(
continue
}
// HWThread -> HWThead
// HWThread -> HWThread
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, APIQuery{
Metric: metric,
@@ -1007,7 +1148,7 @@ func buildNodeQueries(
continue
}
// MemoryDoman -> Node
// MemoryDomain -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
queries = append(queries, APIQuery{
@@ -1071,10 +1212,37 @@ func buildNodeQueries(
return queries, assignedScope, nil
}
// sanitizeStats converts NaN statistics to zero for JSON compatibility.
//
// schema.Float with NaN values cannot be properly JSON-encoded, so we convert
// NaN to 0. This loses the distinction between "no data" and "zero value",
// but maintains API compatibility.
func sanitizeStats(data *APIMetricData) {
if data.Avg.IsNaN() {
data.Avg = schema.Float(0)
}
if data.Min.IsNaN() {
data.Min = schema.Float(0)
}
if data.Max.IsNaN() {
data.Max = schema.Float(0)
}
}
// intToStringSlice converts a slice of integers to a slice of strings.
// Used to convert hardware thread/core/socket IDs from topology (int) to APIQuery TypeIds (string).
//
// Optimized to reuse a byte buffer for string conversion, reducing allocations.
func intToStringSlice(is []int) []string {
if len(is) == 0 {
return nil
}
ss := make([]string, len(is))
buf := make([]byte, 0, 16) // Reusable buffer for integer conversion
for i, x := range is {
ss[i] = strconv.Itoa(x)
buf = strconv.AppendInt(buf[:0], int64(x), 10)
ss[i] = string(buf)
}
return ss
}

View File

@@ -3,13 +3,13 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
package metricstore
import (
"errors"
"math"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
type Stats struct {

View File

@@ -12,7 +12,7 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
"github.com/mattn/go-sqlite3"
"github.com/qustavo/sqlhooks/v2"
@@ -115,3 +115,26 @@ func GetConnection() *DBConnection {
return dbConnInstance
}
// ResetConnection closes the current database connection and resets the connection state.
// This function is intended for testing purposes only to allow test isolation.
func ResetConnection() error {
if dbConnInstance != nil && dbConnInstance.DB != nil {
if err := dbConnInstance.DB.Close(); err != nil {
return fmt.Errorf("failed to close database connection: %w", err)
}
}
dbConnInstance = nil
dbConnOnce = sync.Once{}
jobRepoInstance = nil
jobRepoOnce = sync.Once{}
nodeRepoInstance = nil
nodeRepoOnce = sync.Once{}
userRepoInstance = nil
userRepoOnce = sync.Once{}
userCfgRepoInstance = nil
userCfgRepoOnce = sync.Once{}
return nil
}
