98 Commits

Author SHA1 Message Date
Christoph Kluge
55d2c7d7eb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-16 09:32:05 +01:00
Christoph Kluge
bb527fb410 bump frontend dependencies 2026-01-16 09:32:02 +01:00
9a97d0e8eb Add documentation 2026-01-16 08:27:46 +01:00
93dcfee8c5 Document and refactor 2026-01-16 08:27:15 +01:00
76139ef53c Remove now optional apiAllowedIPs option 2026-01-16 08:23:31 +01:00
Aditya Ujeniya
32319adf72 Add Memory Tracker worker for CCMS 2026-01-15 21:29:21 +01:00
Aditya Ujeniya
10a5c89a16 Fix logic for findFiles() and keep archive worker 2026-01-15 20:27:11 +01:00
Christoph Kluge
40bff1eff9 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 18:18:54 +01:00
Christoph Kluge
ceba4eb0c6 Review dashboards, add two-sided progress to internal dash 2026-01-15 18:18:51 +01:00
Christoph Kluge
faacf3f343 svelte state_referenced_locally warning fixes
- change to derived where possible
- suppress warning elsewhere
- discussion here: sveltejs/svelte/issues/17289
2026-01-15 18:17:45 +01:00
Aditya Ujeniya
7cd98c4f25 Test and update files for dynamic retention 2026-01-15 17:48:59 +01:00
Michael Panzlaff
489ad44b9f Make apiAllowedIPs optional
If our test and production instance just use *, one might as well make
that the default value. This should ease configuration for minimal
setups.
2026-01-15 16:08:29 +01:00
7db2bbe6b0 Add job tagger option to example config 2026-01-15 15:53:54 +01:00
b6f0faa97f Make polarplot default in Jobview 2026-01-15 15:47:40 +01:00
a3fffa8e8b Update example and demo config 2026-01-15 13:57:15 +01:00
72248defbf Cleanup print statements. Always enable Compression 2026-01-15 13:39:22 +01:00
155e05495e Fix shutdown timeout bug 2026-01-15 13:29:19 +01:00
9c92a7796b Introduce nodeprovider interface to break import cycle 2026-01-15 12:20:11 +01:00
7c78407c49 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-15 11:34:09 +01:00
cb219b3c74 Fix configuration issues. Fix shutdown hangs
Always turn on compression
2026-01-15 11:34:06 +01:00
d59aa2e855 Restructure configuration with sensible defaults. Fix shutdown hangs 2026-01-15 11:33:01 +01:00
Christoph Kluge
cd3d133f0d Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 10:12:02 +01:00
Christoph Kluge
3b7fc44ce9 specify job count return logs, move to debug level 2026-01-15 10:11:31 +01:00
e1efc68476 Update dependencies. Rebuild graphql and swagger 2026-01-15 08:32:06 +01:00
8f0bb907ff Improve documentation and add more tests 2026-01-15 06:41:23 +01:00
Christoph Kluge
e5c620ca20 filter metrics for NodeMetrics query 2026-01-14 19:22:31 +01:00
Christoph Kluge
d0bcfb90e6 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 17:18:25 +01:00
Christoph Kluge
9deee54e41 make nextPage query conditional if no return limit is requested 2026-01-14 17:18:21 +01:00
Michael Panzlaff
94b86ef11a Mismatched event types are not something to be concerned about
If a different CCMessage type was sent over the same subject as
requested, that shouldn't raise a warning. This may happen in production
instances, but in order to ease debugging, lower it to 'debug' level.
2026-01-14 16:08:33 +01:00
Christoph Kluge
d8cd752dcb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 16:03:39 +01:00
Christoph Kluge
5d376e6865 fix clustername array init size 2026-01-14 16:03:34 +01:00
9c3beddf54 Improve documentation 2026-01-14 15:39:38 +01:00
c6465ad9e5 Add s3 configuration options 2026-01-14 15:28:34 +01:00
d415381d4a Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 15:10:25 +01:00
211d4fae54 Refactor s3 backend and suppress checksum warning 2026-01-14 15:10:20 +01:00
Aditya Ujeniya
3276ed7785 Half-baked commit for new dynamic retention logic 2026-01-14 14:56:36 +01:00
Christoph Kluge
77b7548ef3 fix wrong varname 2026-01-14 13:00:36 +01:00
Christoph Kluge
59851f410e Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 11:25:43 +01:00
Christoph Kluge
4cb8d648cb adapt frontend to backend config changes, clarify variable names 2026-01-14 11:25:40 +01:00
c8627a13f4 Remove obsolete clusters config section 2026-01-14 11:17:49 +01:00
0ea0270fe1 Reintroduce Clusters as string list of cluster names 2026-01-14 10:37:07 +01:00
19402d30af Review and improve error messages and doc comments 2026-01-14 10:09:19 +01:00
b2f870e3c0 Convert nodestate nats API to influx line protocol payload. Review and add doc comments.
Improve and extend tests
2026-01-14 10:08:06 +01:00
9e542dc200 Review and improve, add documentation 2026-01-14 09:26:03 +01:00
6cf59043a3 Review and improve, add documentation 2026-01-14 08:59:27 +01:00
71b75eea0e Improve GetUsedNodes function 2026-01-14 08:49:55 +01:00
e900a686db Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 07:38:04 +01:00
fb8db3c3ae Add query which node metric data needs to be retained 2026-01-14 07:37:31 +01:00
Christoph Kluge
170a9ace8a Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-13 16:59:57 +01:00
Christoph Kluge
518e9950ea add job exclusivity filter, review db indices 2026-01-13 16:59:52 +01:00
25c8fca561 Revert retention config in metricstore 2026-01-13 14:42:24 +01:00
754f7e16f6 Reformat with gofumpt 2026-01-13 09:52:31 +01:00
04a2e460ae Refactor metricstore. Initial stub for cluster/ subcluster specific retention times 2026-01-13 09:52:00 +01:00
2ebab1e2e2 Reformat with gofumpt 2026-01-13 09:50:57 +01:00
a9366d14c6 Add README for tagging. Enable tagging by flag without configuration option 2026-01-13 08:32:32 +01:00
42809e3f75 Remove embedded tagger rules 2026-01-13 07:20:26 +01:00
4cec933349 Remove obsolete cluster config section 2026-01-13 06:28:33 +01:00
d3f3c532b1 Merge branch 'master' into dev 2026-01-12 11:18:56 +01:00
ad1e87d0b8 Disable dependabot alerts 2026-01-12 11:17:44 +01:00
Jan Eitzinger
affa85c086 Merge pull request #469 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/credentials-1.19.7
Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
2026-01-12 10:30:35 +01:00
Jan Eitzinger
aa053d78f7 Merge pull request #470 from ClusterCockpit/dependabot/go_modules/github.com/mattn/go-sqlite3-1.14.33
Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
2026-01-12 10:30:16 +01:00
dependabot[bot]
fae6d9d835 Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
Bumps [github.com/mattn/go-sqlite3](https://github.com/mattn/go-sqlite3) from 1.14.32 to 1.14.33.
- [Release notes](https://github.com/mattn/go-sqlite3/releases)
- [Commits](https://github.com/mattn/go-sqlite3/compare/v1.14.32...v1.14.33)

---
updated-dependencies:
- dependency-name: github.com/mattn/go-sqlite3
  dependency-version: 1.14.33
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:44 +00:00
dependabot[bot]
78f1db7ad1 Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
Bumps [github.com/aws/aws-sdk-go-v2/credentials](https://github.com/aws/aws-sdk-go-v2) from 1.19.6 to 1.19.7.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/m2/v1.19.6...service/m2/v1.19.7)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/credentials
  dependency-version: 1.19.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:40 +00:00
f1367f84f8 Merge branch 'master' into dev 2026-01-12 09:14:45 +01:00
Jan Eitzinger
4c81696f4d Merge pull request #455 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/rollup-4.54.0
Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
2026-01-12 09:09:42 +01:00
Jan Eitzinger
a91f8f72e3 Merge pull request #459 from ClusterCockpit/dependabot/go_modules/golang.org/x/oauth2-0.34.0
Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
2026-01-12 09:09:18 +01:00
Jan Eitzinger
87f7ed329c Merge pull request #461 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/svelte-5.46.1
Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
2026-01-12 09:08:49 +01:00
dependabot[bot]
8641d9053d Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.32.0 to 0.34.0.
- [Commits](https://github.com/golang/oauth2/compare/v0.32.0...v0.34.0)

---
updated-dependencies:
- dependency-name: golang.org/x/oauth2
  dependency-version: 0.34.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:07:20 +00:00
Jan Eitzinger
4a5ab8a279 Merge pull request #462 from ClusterCockpit/dependabot/go_modules/github.com/99designs/gqlgen-0.17.85
Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
2026-01-12 09:06:18 +01:00
Jan Eitzinger
d179412ab6 Merge pull request #463 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/service/s3-1.95.0
Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
2026-01-12 09:05:50 +01:00
Jan Eitzinger
968c7d179d Merge pull request #464 from ClusterCockpit/dependabot/go_modules/github.com/go-co-op/gocron/v2-2.19.0
Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
2026-01-12 09:05:29 +01:00
56399523d7 Update module deps 2026-01-12 09:00:06 +01:00
4d6326b8be Remove metricsync 2026-01-12 08:55:31 +01:00
dependabot[bot]
a2414791bf Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
Bumps [github.com/go-co-op/gocron/v2](https://github.com/go-co-op/gocron) from 2.18.2 to 2.19.0.
- [Release notes](https://github.com/go-co-op/gocron/releases)
- [Commits](https://github.com/go-co-op/gocron/compare/v2.18.2...v2.19.0)

---
updated-dependencies:
- dependency-name: github.com/go-co-op/gocron/v2
  dependency-version: 2.19.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:05:04 +00:00
dependabot[bot]
faf3a19f0c Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.90.2 to 1.95.0.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.90.2...service/s3/v1.95.0)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/service/s3
  dependency-version: 1.95.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:04:58 +00:00
dependabot[bot]
4e6038d6c1 Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
Bumps [github.com/99designs/gqlgen](https://github.com/99designs/gqlgen) from 0.17.84 to 0.17.85.
- [Release notes](https://github.com/99designs/gqlgen/releases)
- [Changelog](https://github.com/99designs/gqlgen/blob/master/CHANGELOG.md)
- [Commits](https://github.com/99designs/gqlgen/compare/v0.17.84...v0.17.85)

---
updated-dependencies:
- dependency-name: github.com/99designs/gqlgen
  dependency-version: 0.17.85
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:03:41 +00:00
dependabot[bot]
ddc2ecf829 Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
Bumps [svelte](https://github.com/sveltejs/svelte/tree/HEAD/packages/svelte) from 5.44.0 to 5.46.1.
- [Release notes](https://github.com/sveltejs/svelte/releases)
- [Changelog](https://github.com/sveltejs/svelte/blob/main/packages/svelte/CHANGELOG.md)
- [Commits](https://github.com/sveltejs/svelte/commits/svelte@5.46.1/packages/svelte)

---
updated-dependencies:
- dependency-name: svelte
  dependency-version: 5.46.1
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:02:46 +00:00
ecb5aef735 Fix build error in unit test 2025-12-25 08:48:03 +01:00
11ec2267da Major refactor of metric data handling
- Make the internal memory store required and default
- Rename memorystore to metricstore
- Rename metricDataDispatcher to metricdispatch
- Remove metricdata package
- Introduce metricsync package for upstream metric data pull
2025-12-25 08:42:54 +01:00
8576ae458d Switch to cc-lib v2 2025-12-24 09:24:18 +01:00
Jan Eitzinger
c66445acb5 Merge pull request #458 from ClusterCockpit/dependabot/go_modules/github.com/expr-lang/expr-1.17.7
Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
2025-12-23 21:16:13 +01:00
dependabot[bot]
29a20f7b0b Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
Bumps [github.com/expr-lang/expr](https://github.com/expr-lang/expr) from 1.17.6 to 1.17.7.
- [Release notes](https://github.com/expr-lang/expr/releases)
- [Commits](https://github.com/expr-lang/expr/compare/v1.17.6...v1.17.7)

---
updated-dependencies:
- dependency-name: github.com/expr-lang/expr
  dependency-version: 1.17.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-23 09:07:01 +00:00
Jan Eitzinger
874c019fb6 Merge pull request #457 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/config-1.32.6
Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
2025-12-23 10:06:17 +01:00
Jan Eitzinger
54825626de Merge pull request #456 from ClusterCockpit/dependabot/go_modules/github.com/coreos/go-oidc/v3-3.17.0
Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
2025-12-23 10:05:56 +01:00
9bf5c5dc1a Update README and config schema 2025-12-23 09:34:09 +01:00
64fef9774c Add unit test for NATS API 2025-12-23 09:22:57 +01:00
999667ec0c Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 07:56:16 +01:00
c1135531ba Port NATS api to ccMessages 2025-12-23 07:56:13 +01:00
287256e5f1 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 05:56:49 +01:00
0bc26aa194 Add error check 2025-12-23 05:56:46 +01:00
Christoph Kluge
502d7e9084 Rework info panel in public dashboard
- change to bootstrap grid from table
- add infos, use badges
- remove non required query
2025-12-22 17:26:56 +01:00
Christoph Kluge
89875db4a9 dashboard layout fixes 2025-12-22 10:39:40 +01:00
dependabot[bot]
5a8b929448 Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.31.20 to 1.32.6.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.31.20...v1.32.6)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/config
  dependency-version: 1.32.6
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:04:43 +00:00
dependabot[bot]
fe78f2f433 Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
Bumps [github.com/coreos/go-oidc/v3](https://github.com/coreos/go-oidc) from 3.16.0 to 3.17.0.
- [Release notes](https://github.com/coreos/go-oidc/releases)
- [Commits](https://github.com/coreos/go-oidc/compare/v3.16.0...v3.17.0)

---
updated-dependencies:
- dependency-name: github.com/coreos/go-oidc/v3
  dependency-version: 3.17.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:03:31 +00:00
dependabot[bot]
e37591ce6d Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
Bumps [rollup](https://github.com/rollup/rollup) from 4.53.3 to 4.54.0.
- [Release notes](https://github.com/rollup/rollup/releases)
- [Changelog](https://github.com/rollup/rollup/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rollup/rollup/compare/v4.53.3...v4.54.0)

---
updated-dependencies:
- dependency-name: rollup
  dependency-version: 4.54.0
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:02:41 +00:00
Jan Eitzinger
998f800632 Merge pull request #454 from ClusterCockpit/dev
Dev
2025-12-18 15:52:06 +01:00
Jan Eitzinger
10a0b0add8 Merge pull request #452 from ClusterCockpit/dev
Dev
2025-12-18 11:28:07 +01:00
Jan Eitzinger
f9aa47ea1c Merge pull request #450 from ClusterCockpit/dev
Dev
2025-12-15 14:42:26 +01:00
213 changed files with 11807 additions and 5891 deletions

View File

@@ -1,15 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
  - package-ecosystem: "gomod"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "npm"
    directory: "/web/frontend"
    schedule:
      interval: "weekly"

104
CLAUDE.md
View File

@@ -96,15 +96,19 @@ The backend follows a layered architecture with clear separation of concerns:
- **internal/auth**: Authentication layer - **internal/auth**: Authentication layer
- Supports local accounts, LDAP, OIDC, and JWT tokens - Supports local accounts, LDAP, OIDC, and JWT tokens
- Implements rate limiting for login attempts - Implements rate limiting for login attempts
- **internal/metricdata**: Metric data repository abstraction - **internal/metricstore**: Metric store with data loading API
- Pluggable backends: cc-metric-store, Prometheus, InfluxDB - In-memory metric storage with checkpointing
- Each cluster can have a different metric data backend - Query API for loading job metric data
- **internal/archiver**: Job archiving to file-based archive - **internal/archiver**: Job archiving to file-based archive
- **internal/api/nats.go**: NATS-based API for job and node operations
- Subscribes to NATS subjects for job events (start/stop)
- Handles node state updates via NATS
- Uses InfluxDB line protocol message format
- **pkg/archive**: Job archive backend implementations - **pkg/archive**: Job archive backend implementations
- File system backend (default) - File system backend (default)
- S3 backend - S3 backend
- SQLite backend (experimental) - SQLite backend (experimental)
- **pkg/nats**: NATS integration for metric ingestion - **pkg/nats**: NATS client and message decoding utilities
### Frontend Structure ### Frontend Structure
@@ -146,6 +150,14 @@ applied automatically on startup. Version tracking in `version` table.
## Configuration ## Configuration
- **config.json**: Main configuration (clusters, metric repositories, archive settings) - **config.json**: Main configuration (clusters, metric repositories, archive settings)
- `main.apiSubjects`: NATS subject configuration (optional)
- `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event")
- `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state")
- `nats`: NATS client connection configuration (optional)
- `address`: NATS server address (e.g., "nats://localhost:4222")
- `username`: Authentication username (optional)
- `password`: Authentication password (optional)
- `creds-file-path`: Path to NATS credentials file (optional)
- **.env**: Environment variables (secrets like JWT keys) - **.env**: Environment variables (secrets like JWT keys)
- Copy from `configs/env-template.txt` - Copy from `configs/env-template.txt`
- NEVER commit this file - NEVER commit this file
@@ -197,8 +209,8 @@ applied automatically on startup. Version tracking in `version` table.
### Adding a new metric data backend ### Adding a new metric data backend
1. Implement `MetricDataRepository` interface in `internal/metricdata/` 1. Implement metric loading functions in `internal/metricstore/query.go`
2. Register in `metricdata.Init()` switch statement 2. Add cluster configuration to metric store initialization
3. Update config.json schema documentation 3. Update config.json schema documentation
### Modifying database schema ### Modifying database schema
@@ -207,9 +219,87 @@ applied automatically on startup. Version tracking in `version` table.
2. Increment `repository.Version` 2. Increment `repository.Version`
3. Test with fresh database and existing database 3. Test with fresh database and existing database
## NATS API
The backend supports a NATS-based API as an alternative to the REST API for job and node operations.
### Setup
1. Configure NATS client connection in `config.json`:
```json
{
"nats": {
"address": "nats://localhost:4222",
"username": "user",
"password": "pass"
}
}
```
2. Configure API subjects in `config.json` under `main`:
```json
{
"main": {
"apiSubjects": {
"subjectJobEvent": "cc.job.event",
"subjectNodeState": "cc.node.state"
}
}
}
```
### Message Format
Messages use **InfluxDB line protocol** format with the following structure:
#### Job Events
**Start Job:**
```
job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
```
**Stop Job:**
```
job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
```
**Tags:**
- `function`: Either `start_job` or `stop_job`
**Fields:**
- `event`: JSON payload containing job data (see REST API documentation for schema)
#### Node State Updates
```json
{
"cluster": "testcluster",
"nodes": [
{
"hostname": "node001",
"states": ["allocated"],
"cpusAllocated": 8,
"memoryAllocated": 16384,
"gpusAllocated": 0,
"jobsRunning": 1
}
]
}
```
### Implementation Notes
- NATS API mirrors REST API functionality but uses messaging
- Job start/stop events are processed asynchronously
- Duplicate job detection is handled (same as REST API)
- All validation rules from REST API apply
- Messages are logged; no responses are sent back to publishers
- If NATS client is unavailable, API subscriptions are skipped (logged as warning)
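For orientation, here is a minimal publisher sketch (not part of the repository) showing how a job start event in the line protocol format described above could be sent with the `nats.go` client. The server address, subject, and payload values are placeholders taken from the examples in this section:
```go
package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	nc, err := nats.Connect("nats://localhost:4222")
	if err != nil {
		panic(err)
	}
	defer nc.Close()

	// JSON payload carried in the "event" field; field names follow the
	// REST API job schema (values here are made up).
	event := `{"jobId":123,"user":"alice","cluster":"test","startTime":1234567890}`

	// Escape backslashes and double quotes as required for string fields
	// in the InfluxDB line protocol.
	escaped := strings.NewReplacer(`\`, `\\`, `"`, `\"`).Replace(event)

	msg := fmt.Sprintf(`job,function=start_job event="%s" %d`,
		escaped, time.Now().UnixNano())

	if err := nc.Publish("cc.job.event", []byte(msg)); err != nil {
		panic(err)
	}
	nc.Flush() // make sure the message is sent before closing
}
```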
## Dependencies ## Dependencies
- Go 1.24.0+ (check go.mod for exact version) - Go 1.24.0+ (check go.mod for exact version)
- Node.js (for frontend builds) - Node.js (for frontend builds)
- SQLite 3 (only supported database) - SQLite 3 (only supported database)
- Optional: NATS server for metric ingestion - Optional: NATS server for NATS API integration

View File

@@ -22,11 +22,12 @@ switching from PHP Symfony to a Golang based solution are explained
## Overview ## Overview
This is a Golang web backend for the ClusterCockpit job-specific performance This is a Golang web backend for the ClusterCockpit job-specific performance
monitoring framework. It provides a REST API for integrating ClusterCockpit with monitoring framework. It provides a REST API and an optional NATS-based messaging
an HPC cluster batch system and external analysis scripts. Data exchange between API for integrating ClusterCockpit with an HPC cluster batch system and external
the web front-end and the back-end is based on a GraphQL API. The web frontend analysis scripts. Data exchange between the web front-end and the back-end is
is also served by the backend using [Svelte](https://svelte.dev/) components. based on a GraphQL API. The web frontend is also served by the backend using
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using [Svelte](https://svelte.dev/) components. Layout and styling are based on
[Bootstrap 5](https://getbootstrap.com/) using
[Bootstrap Icons](https://icons.getbootstrap.com/). [Bootstrap Icons](https://icons.getbootstrap.com/).
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database. The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
@@ -35,6 +36,10 @@ databases, the only tested and supported setup is to use cc-metric-store as the
metric data backend. Documentation on how to integrate ClusterCockpit with other metric data backend. Documentation on how to integrate ClusterCockpit with other
time series databases will be added in the future. time series databases will be added in the future.
For real-time integration with HPC systems, the backend can subscribe to
[NATS](https://nats.io/) subjects to receive job start/stop events and node
state updates, providing an alternative to REST API polling.
Completed batch jobs are stored in a file-based job archive according to Completed batch jobs are stored in a file-based job archive according to
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive). [this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
The backend supports authentication via local accounts, an external LDAP The backend supports authentication via local accounts, an external LDAP
@@ -130,27 +135,58 @@ ln -s <your-existing-job-archive> ./var/job-archive
## Project file structure ## Project file structure
- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github)
GitHub Actions workflows and dependabot configuration for CI/CD.
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api) - [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
contains the API schema files for the REST and GraphQL APIs. The REST API is contains the API schema files for the REST and GraphQL APIs. The REST API is
documented in the OpenAPI 3.0 format in documented in the OpenAPI 3.0 format in
[./api/openapi.yaml](./api/openapi.yaml). [./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in
[./api/schema.graphqls](./api/schema.graphqls).
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend) - [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
contains `main.go` for the main application. contains the main application entry point and CLI implementation.
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs) - [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
contains documentation about configuration and command line options and required contains documentation about configuration and command line options and required
environment variables. A sample configuration file is provided. environment variables. Sample configuration files are provided.
- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
contains more in-depth documentation.
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init) - [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
contains an example of setting up systemd for production use. contains an example of setting up systemd for production use.
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal) - [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
contains library source code that is not intended for use by others. contains library source code that is not intended for use by others.
- [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api)
REST API handlers and NATS integration
- [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver)
Job archiving functionality
- [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth)
Authentication (local, LDAP, OIDC) and JWT token handling
- [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config)
Configuration management and validation
- [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph)
GraphQL schema and resolvers
- [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer)
Job data import and database initialization
- [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstore)
In-memory metric data store with checkpointing and metric loading
- [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch)
Dispatches metric data loading to appropriate backends
- [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository)
Database repository layer for jobs and metadata
- [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig)
HTTP router configuration and middleware
- [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger)
Job classification and application detection
- [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
Background task management and scheduled jobs
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg) - [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
contains Go packages that can be used by other projects. contains Go packages that can be used by other projects.
- [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
Job archive backend implementations (filesystem, S3)
- [`nats`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/nats)
NATS client and message handling
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools) - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
Additional command line helper tools. Additional command line helper tools.
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager) - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
Commands for getting infos about and existing job archive. Commands for getting infos about an existing job archive.
- [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
Tool for migrating job archives between formats.
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey) - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
Tool to convert external pubkey for use in `cc-backend`. Tool to convert external pubkey for use in `cc-backend`.
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair) - [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
@@ -162,7 +198,7 @@ ln -s <your-existing-job-archive> ./var/job-archive
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend) - [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
Svelte components and static assets for the frontend UI Svelte components and static assets for the frontend UI
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates) - [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
Server-side Go templates Server-side Go templates, including monitoring views
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml) - [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
Configures the behaviour and generation of Configures the behaviour and generation of
[gqlgen](https://github.com/99designs/gqlgen). [gqlgen](https://github.com/99designs/gqlgen).

View File

@@ -458,6 +458,7 @@ input JobFilter {
state: [JobState!] state: [JobState!]
metricStats: [MetricStatItem!] metricStats: [MetricStatItem!]
shared: String shared: String
schedule: String
node: StringInput node: StringInput
} }

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -15,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/util" "github.com/ClusterCockpit/cc-lib/v2/util"
) )
const envString = ` const envString = `
@@ -36,7 +36,7 @@ const configString = `
"short-running-jobs-duration": 300, "short-running-jobs-duration": 300,
"resampling": { "resampling": {
"minimumPoints": 600, "minimumPoints": 600,
"trigger": 180, "trigger": 300,
"resolutions": [ "resolutions": [
240, 240,
60 60
@@ -48,7 +48,7 @@ const configString = `
"emission-constant": 317 "emission-constant": 317
}, },
"cron": { "cron": {
"commit-job-worker": "2m", "commit-job-worker": "1m",
"duration-worker": "5m", "duration-worker": "5m",
"footprint-worker": "10m" "footprint-worker": "10m"
}, },
@@ -60,31 +60,7 @@ const configString = `
"jwts": { "jwts": {
"max-age": "2000h" "max-age": "2000h"
} }
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
} }
}
}
]
} }
` `

View File

@@ -24,19 +24,18 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/tagger" "github.com/ClusterCockpit/cc-backend/internal/tagger"
"github.com/ClusterCockpit/cc-backend/internal/taskmanager" "github.com/ClusterCockpit/cc-backend/internal/taskmanager"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/nats" "github.com/ClusterCockpit/cc-backend/pkg/nats"
"github.com/ClusterCockpit/cc-backend/web" "github.com/ClusterCockpit/cc-backend/web"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/runtimeEnv" "github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/util" "github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/google/gops/agent" "github.com/google/gops/agent"
"github.com/joho/godotenv" "github.com/joho/godotenv"
@@ -103,12 +102,7 @@ func initConfiguration() error {
return fmt.Errorf("main configuration must be present") return fmt.Errorf("main configuration must be present")
} }
clustercfg := ccconf.GetPackageConfig("clusters") config.Init(cfg)
if clustercfg == nil {
return fmt.Errorf("cluster configuration must be present")
}
config.Init(cfg, clustercfg)
return nil return nil
} }
@@ -277,21 +271,14 @@ func initSubsystems() error {
// Initialize job archive // Initialize job archive
archiveCfg := ccconf.GetPackageConfig("archive") archiveCfg := ccconf.GetPackageConfig("archive")
if archiveCfg == nil { if archiveCfg == nil {
cclog.Debug("Archive configuration not found, using default archive configuration")
archiveCfg = json.RawMessage(defaultArchiveConfig) archiveCfg = json.RawMessage(defaultArchiveConfig)
} }
if err := archive.Init(archiveCfg, config.Keys.DisableArchive); err != nil { if err := archive.Init(archiveCfg, config.Keys.DisableArchive); err != nil {
return fmt.Errorf("initializing archive: %w", err) return fmt.Errorf("initializing archive: %w", err)
} }
// Initialize metricdata // Note: metricstore.Init() is called later in runServer() with proper configuration
// if err := metricdata.Init(); err != nil {
// return fmt.Errorf("initializing metricdata repository: %w", err)
// }
// Initialize upstream metricdata repositories for pull worker
if err := metricdata.InitUpstreamRepos(); err != nil {
return fmt.Errorf("initializing upstream metricdata repositories: %w", err)
}
// Handle database re-initialization // Handle database re-initialization
if flagReinitDB { if flagReinitDB {
@@ -316,6 +303,8 @@ func initSubsystems() error {
// Apply tags if requested // Apply tags if requested
if flagApplyTags { if flagApplyTags {
tagger.Init()
if err := tagger.RunTaggers(); err != nil { if err := tagger.RunTaggers(); err != nil {
return fmt.Errorf("running job taggers: %w", err) return fmt.Errorf("running job taggers: %w", err)
} }
@@ -330,9 +319,14 @@ func runServer(ctx context.Context) error {
// Initialize metric store if configuration is provided // Initialize metric store if configuration is provided
mscfg := ccconf.GetPackageConfig("metric-store") mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil { if mscfg != nil {
memorystore.Init(mscfg, &wg) metricstore.Init(mscfg, &wg)
// Inject repository as NodeProvider to break import cycle
ms := metricstore.GetMemoryStore()
jobRepo := repository.GetJobRepository()
ms.SetNodeProvider(jobRepo)
} else { } else {
cclog.Debug("Metric store configuration not found, skipping memorystore initialization") return fmt.Errorf("missing metricstore configuration")
} }
// Start archiver and task manager // Start archiver and task manager
@@ -375,7 +369,7 @@ func runServer(ctx context.Context) error {
case <-ctx.Done(): case <-ctx.Done():
} }
runtimeEnv.SystemdNotifiy(false, "Shutting down ...") runtime.SystemdNotify(false, "Shutting down ...")
srv.Shutdown(ctx) srv.Shutdown(ctx)
util.FsWatcherShutdown() util.FsWatcherShutdown()
taskmanager.Shutdown() taskmanager.Shutdown()
@@ -385,24 +379,39 @@ func runServer(ctx context.Context) error {
if os.Getenv(envGOGC) == "" { if os.Getenv(envGOGC) == "" {
debug.SetGCPercent(25) debug.SetGCPercent(25)
} }
runtimeEnv.SystemdNotifiy(true, "running") runtime.SystemdNotify(true, "running")
// Wait for completion or error waitDone := make(chan struct{})
go func() { go func() {
wg.Wait() wg.Wait()
close(waitDone)
}()
go func() {
<-waitDone
close(errChan) close(errChan)
}() }()
// Check for server startup errors // Wait for either:
// 1. An error from server startup
// 2. Completion of all goroutines (normal shutdown or crash)
select {
case err := <-errChan:
// errChan will be closed when waitDone is closed, which happens
// when all goroutines complete (either from normal shutdown or error)
if err != nil {
return err
}
case <-time.After(100 * time.Millisecond):
// Give the server 100ms to start and report any immediate startup errors
// After that, just wait for normal shutdown completion
select { select {
case err := <-errChan: case err := <-errChan:
if err != nil { if err != nil {
return err return err
} }
case <-time.After(100 * time.Millisecond): case <-waitDone:
// Server started successfully, wait for completion // Normal shutdown completed
if err := <-errChan; err != nil {
return err
} }
} }

View File

@@ -29,12 +29,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated" "github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig" "github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/nats" "github.com/ClusterCockpit/cc-backend/pkg/nats"
"github.com/ClusterCockpit/cc-backend/web" "github.com/ClusterCockpit/cc-backend/web"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/runtimeEnv" "github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/gorilla/handlers" "github.com/gorilla/handlers"
"github.com/gorilla/mux" "github.com/gorilla/mux"
httpSwagger "github.com/swaggo/http-swagger" httpSwagger "github.com/swaggo/http-swagger"
@@ -345,7 +345,7 @@ func (s *Server) Start(ctx context.Context) error {
// Because this program will want to bind to a privileged port (like 80), the listener must // Because this program will want to bind to a privileged port (like 80), the listener must
// be established first, then the user can be changed, and after that, // be established first, then the user can be changed, and after that,
// the actual http server can be started. // the actual http server can be started.
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil { if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
return fmt.Errorf("dropping privileges: %w", err) return fmt.Errorf("dropping privileges: %w", err)
} }
@@ -381,7 +381,7 @@ func (s *Server) Shutdown(ctx context.Context) {
} }
// Archive all the metric store data // Archive all the metric store data
memorystore.Shutdown() metricstore.Shutdown()
// Shutdown archiver with 10 second timeout for fast shutdown // Shutdown archiver with 10 second timeout for fast shutdown
if err := archiver.Shutdown(10 * time.Second); err != nil { if err := archiver.Shutdown(10 * time.Second); err != nil {

View File

@@ -1,96 +1,23 @@
{ {
"main": { "main": {
"addr": "127.0.0.1:8080", "addr": "127.0.0.1:8080"
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
},
"apiAllowedIPs": [
"*"
],
"emission-constant": 317
}, },
"cron": { "cron": {
"commit-job-worker": "2m", "commit-job-worker": "1m",
"duration-worker": "5m", "duration-worker": "3m",
"footprint-worker": "10m" "footprint-worker": "5m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
}, },
"auth": { "auth": {
"jwts": { "jwts": {
"max-age": "2000h" "max-age": "2000h"
} }
}, },
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"clusters": [
{
"name": "fritz",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
],
"metric-store": { "metric-store": {
"checkpoints": { "checkpoints": {
"file-format": "avro", "interval": "12h"
"interval": "1h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "1h",
"directory": "./var/archive"
}, },
"retention-in-memory": "48h", "retention-in-memory": "48h",
"subscriptions": [ "memory-cap": 100
{
"subscribe-to": "hpc-nats",
"cluster-tag": "fritz"
},
{
"subscribe-to": "hpc-nats",
"cluster-tag": "alex"
}
]
} }
} }

View File

@@ -5,45 +5,62 @@
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem", "https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit", "user": "clustercockpit",
"group": "clustercockpit", "group": "clustercockpit",
"validate": false,
"apiAllowedIPs": ["*"], "apiAllowedIPs": ["*"],
"short-running-jobs-duration": 300, "short-running-jobs-duration": 300,
"enable-job-taggers": true,
"resampling": { "resampling": {
"minimumPoints": 600, "minimumPoints": 600,
"trigger": 180, "trigger": 180,
"resolutions": [ "resolutions": [240, 60]
240, },
60 "apiSubjects": {
] "subjectJobEvent": "cc.job.event",
"subjectNodeState": "cc.node.state"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"auth": {
"jwts": {
"max-age": "2000h"
} }
}, },
"cron": { "cron": {
"commit-job-worker": "2m", "commit-job-worker": "1m",
"duration-worker": "5m", "duration-worker": "5m",
"footprint-worker": "10m" "footprint-worker": "10m"
}, },
"archive": { "archive": {
"kind": "file", "kind": "s3",
"path": "./var/job-archive" "endpoint": "http://x.x.x.x",
"bucket": "jobarchive",
"accessKey": "xx",
"secretKey": "xx",
"retention": {
"policy": "move",
"age": 365,
"location": "./var/archive"
}
}, },
"clusters": [ "metric-store": {
"checkpoints": {
"interval": "12h"
},
"memory-cap": 100,
"retention-in-memory": "48h",
"nats-subscriptions": [
{ {
"name": "test", "subscribe-to": "hpc-nats",
"filterRanges": { "cluster-tag": "fritz"
"numNodes": {
"from": 1,
"to": 64
}, },
"duration": { {
"from": 0, "subscribe-to": "hpc-nats",
"to": 86400 "cluster-tag": "alex"
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
} }
] ]
},
"ui-file": "ui-config.json"
} }

419
configs/tagger/README.md Normal file
View File

@@ -0,0 +1,419 @@
# Job Tagging Configuration
ClusterCockpit provides automatic job tagging functionality to classify and
categorize jobs based on configurable rules. The tagging system consists of two
main components:
1. **Application Detection** - Identifies which application a job is running
2. **Job Classification** - Analyzes job performance characteristics and applies classification tags
## Directory Structure
```
configs/tagger/
├── apps/ # Application detection patterns
│ ├── vasp.txt
│ ├── gromacs.txt
│ └── ...
└── jobclasses/ # Job classification rules
├── parameters.json
├── lowUtilization.json
├── highload.json
└── ...
```
## Activating Tagger Rules
### Step 1: Copy Configuration Files
To activate tagging, review, adapt, and copy the configuration files from
`configs/tagger/` to `var/tagger/`:
```bash
# From the cc-backend root directory
mkdir -p var/tagger
cp -r configs/tagger/apps var/tagger/
cp -r configs/tagger/jobclasses var/tagger/
```
### Step 2: Enable Tagging in Configuration
Add or set the following configuration key in the `main` section of your `config.json`:
```json
{
"enable-job-taggers": true
}
```
**Important**: Automatic tagging is disabled by default. You must explicitly
enable it by setting `enable-job-taggers: true` in the main configuration file.
### Step 3: Restart cc-backend
The tagger system automatically loads configuration from `./var/tagger/` at
startup. After copying the files and enabling the feature, restart cc-backend:
```bash
./cc-backend -server
```
### Step 4: Verify Configuration Loaded
Check the logs for messages indicating successful configuration loading:
```
[INFO] Setup file watch for ./var/tagger/apps
[INFO] Setup file watch for ./var/tagger/jobclasses
```
## How Tagging Works
### Automatic Tagging
When `enable-job-taggers` is set to `true` in the configuration, tags are
automatically applied when:
- **Job Start**: Application detection runs immediately when a job starts
- **Job Stop**: Job classification runs when a job completes
The system analyzes job metadata and metrics to determine appropriate tags.
**Note**: Automatic tagging only works for jobs that start or stop after the
feature is enabled. Existing jobs are not automatically retagged.
### Manual Tagging (Retroactive)
To apply tags to existing jobs in the database, use the `-apply-tags` command
line option:
```bash
./cc-backend -apply-tags
```
This processes all jobs in the database and applies current tagging rules. This
is useful when:
- You have existing jobs that were created before tagging was enabled
- You've added new tagging rules and want to apply them to historical data
- You've modified existing rules and want to re-evaluate all jobs
### Hot Reload
The tagger system watches the configuration directories for changes. You can
modify or add rules without restarting `cc-backend`:
- Changes to `var/tagger/apps/*` are detected automatically
- Changes to `var/tagger/jobclasses/*` are detected automatically
## Application Detection
Application detection identifies which software a job is running by matching
patterns in the job script.
### Configuration Format
Application patterns are stored in text files under `var/tagger/apps/`. Each
file contains one or more regular expression patterns (one per line) that match
against the job script.
**Example: `apps/vasp.txt`**
```
vasp
VASP
```
### How It Works
1. When a job starts, the system retrieves the job script from metadata
2. Each line in the app files is treated as a regex pattern
3. Patterns are matched case-insensitively against the lowercased job script
4. If a match is found, a tag of type `app` with the filename (without extension) is applied
5. Only the first matching application is tagged
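The matching flow above can be illustrated with a small standalone sketch; the function, file names, and values here are hypothetical and do not correspond to the actual cc-backend implementation:
```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

// detectApp returns the tag name (pattern file name without extension) if
// any pattern in the file matches the job script, or "" if none match.
func detectApp(patternFile, jobScript string) (string, error) {
	f, err := os.Open(patternFile)
	if err != nil {
		return "", err
	}
	defer f.Close()

	script := strings.ToLower(jobScript)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		pattern := strings.TrimSpace(scanner.Text())
		if pattern == "" {
			continue
		}
		// Case-insensitive match against the lowercased job script.
		re, err := regexp.Compile("(?i)" + pattern)
		if err != nil {
			continue // skip invalid patterns
		}
		if re.MatchString(script) {
			return strings.TrimSuffix(filepath.Base(patternFile), ".txt"), nil
		}
	}
	return "", scanner.Err()
}

func main() {
	tag, err := detectApp("var/tagger/apps/vasp.txt", "#!/bin/bash\nsrun vasp_std")
	if err != nil {
		panic(err)
	}
	fmt.Println("detected app tag:", tag) // e.g. "vasp"
}
```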
### Adding New Applications
1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`)
2. Add regex patterns, one per line:
```
tensorflow
tf\.keras
import tensorflow
```
3. The file is automatically detected and loaded
**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`).
## Job Classification
Job classification analyzes completed jobs based on their metrics and properties
to identify performance issues or characteristics.
### Configuration Format
Job classification rules are defined in JSON files under
`var/tagger/jobclasses/`. Each rule file defines:
- **Metrics required**: Which job metrics to analyze
- **Requirements**: Pre-conditions that must be met
- **Variables**: Computed values used in the rule
- **Rule expression**: Boolean expression that determines if the rule matches
- **Hint template**: Message displayed when the rule matches
### Parameters File
`jobclasses/parameters.json` defines shared threshold values used across multiple rules:
```json
{
"lowcpuload_threshold_factor": 0.9,
"highmemoryusage_threshold_factor": 0.9,
"job_min_duration_seconds": 600.0,
"sampling_interval_seconds": 30.0
}
```
### Rule File Structure
**Example: `jobclasses/lowUtilization.json`**
```json
{
"name": "Low resource utilization",
"tag": "lowutilization",
"parameters": ["job_min_duration_seconds"],
"metrics": ["flops_any", "mem_bw"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "mem_bw_perc",
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
}
],
"rule": "flops_any.avg < flops_any.limits.alert",
"hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
}
```
#### Field Descriptions
| Field | Description |
| -------------- | ----------------------------------------------------------------------------- |
| `name` | Human-readable description of the rule |
| `tag` | Tag identifier applied when the rule matches |
| `parameters` | List of parameter names from `parameters.json` to include in rule environment |
| `metrics` | List of metrics required for evaluation (must be present in job data) |
| `requirements` | Boolean expressions that must all be true for the rule to be evaluated |
| `variables` | Named expressions computed before evaluating the main rule |
| `rule` | Boolean expression that determines if the job matches this classification |
| `hint` | Go template string for generating a user-visible message |
### Expression Environment
Expressions in `requirements`, `variables`, and `rule` have access to:
**Job Properties:**
- `job.shared` - Shared node allocation type
- `job.duration` - Job runtime in seconds
- `job.numCores` - Number of CPU cores
- `job.numNodes` - Number of nodes
- `job.jobState` - Job completion state
- `job.numAcc` - Number of accelerators
- `job.smt` - SMT setting
**Metric Statistics (for each metric in `metrics`):**
- `<metric>.min` - Minimum value
- `<metric>.max` - Maximum value
- `<metric>.avg` - Average value
- `<metric>.limits.peak` - Peak limit from cluster config
- `<metric>.limits.normal` - Normal threshold
- `<metric>.limits.caution` - Caution threshold
- `<metric>.limits.alert` - Alert threshold
**Parameters:**
- All parameters listed in the `parameters` field
**Variables:**
- All variables defined in the `variables` array
### Expression Language
Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations:
- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^`
- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Logical**: `&&`, `||`, `!`
- **Functions**: Standard math functions (see expr documentation)
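As an illustration, a rule expression like the one in `lowUtilization.json` can be evaluated with the expr library against a plain map environment. This is a standalone sketch with invented metric values, not the evaluation code used by cc-backend:
```go
package main

import (
	"fmt"

	"github.com/expr-lang/expr"
)

func main() {
	// Hypothetical evaluation environment: job properties, metric
	// statistics, and parameters as described above.
	env := map[string]any{
		"job": map[string]any{"duration": 7200, "numCores": 16},
		"flops_any": map[string]any{
			"avg":    12.5,
			"limits": map[string]any{"alert": 50.0},
		},
		"job_min_duration_seconds": 600.0,
	}

	program, err := expr.Compile(`flops_any.avg < flops_any.limits.alert`, expr.Env(env))
	if err != nil {
		panic(err)
	}

	matched, err := expr.Run(program, env)
	if err != nil {
		panic(err)
	}
	fmt.Println("rule matched:", matched) // rule matched: true
}
```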
### Hint Templates
Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible:
```
{{.flops_any.avg}} # Access metric average
{{.job.duration}} # Access job property
{{.my_variable}} # Access computed variable
```
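A matching sketch for rendering a hint with Go's `text/template`, again with invented values:
```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// Same kind of environment map as used for rule evaluation.
	env := map[string]any{
		"flops_any": map[string]any{
			"avg":    12.5,
			"limits": map[string]any{"alert": 50.0},
		},
	}

	hint := "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"

	tmpl := template.Must(template.New("hint").Parse(hint))
	// Prints: Average flop rate 12.5 falls below threshold 50
	if err := tmpl.Execute(os.Stdout, env); err != nil {
		panic(err)
	}
}
```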
### Adding New Classification Rules
1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`)
2. Define the rule structure:
```json
{
"name": "Memory Leak Detection",
"tag": "memory_leak",
"parameters": ["memory_leak_slope_threshold"],
"metrics": ["mem_used"],
"requirements": ["job.duration > 3600"],
"variables": [
{
"name": "mem_growth",
"expr": "(mem_used.max - mem_used.min) / job.duration"
}
],
"rule": "mem_growth > memory_leak_slope_threshold",
"hint": "Memory usage grew by {{.mem_growth}} per second"
}
```
3. Add any new parameters to `parameters.json`
4. The file is automatically detected and loaded
## Configuration Paths
The tagger system reads from these paths (relative to cc-backend working directory):
- **Application patterns**: `./var/tagger/apps/`
- **Job classification rules**: `./var/tagger/jobclasses/`
These paths are defined as constants in the source code and cannot be changed without recompiling.
## Troubleshooting
### Tags Not Applied
1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json`
2. **Check configuration exists**:
```bash
ls -la var/tagger/apps
ls -la var/tagger/jobclasses
```
3. **Check logs for errors**:
```bash
./cc-backend -server -loglevel debug
```
4. **Verify file permissions**: Ensure cc-backend can read the configuration files
5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs
### Rules Not Matching
1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation
2. **Check requirements**: Ensure all requirements in the rule are satisfied
3. **Verify metrics exist**: Classification rules require job metrics to be available
4. **Check metric names**: Ensure metric names match those in your cluster configuration
### File Watch Not Working
If changes to configuration files aren't detected:
1. Restart cc-backend to reload all configuration
2. Check filesystem supports file watching (network filesystems may not)
3. Check logs for file watch setup messages
## Best Practices
1. **Start Simple**: Begin with basic rules and refine based on results
2. **Use Requirements**: Filter out irrelevant jobs early with requirements
3. **Test Incrementally**: Add one rule at a time and verify behavior
4. **Document Rules**: Use descriptive names and clear hint messages
5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency
6. **Version Control**: Keep your `var/tagger/` configuration in version control
7. **Backup Before Changes**: Test new rules on a copy before deploying to production
## Examples
### Simple Application Detection
**File: `var/tagger/apps/python.txt`**
```
python
python3
\.py
```
This detects jobs running Python scripts.
### Complex Classification Rule
**File: `var/tagger/jobclasses/cpuImbalance.json`**
```json
{
"name": "CPU Load Imbalance",
"tag": "cpu_imbalance",
"parameters": ["core_load_imbalance_threshold_factor"],
"metrics": ["cpu_load"],
"requirements": ["job.numCores > 1", "job.duration > 600"],
"variables": [
{
"name": "load_variance",
"expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg"
}
],
"rule": "load_variance > core_load_imbalance_threshold_factor",
"hint": "CPU load varies by {{printf \"%.1f%%\" (load_variance * 100)}} across cores"
}
```
This detects jobs where CPU load is unevenly distributed across cores.
## Reference
### Configuration Options
**Main Configuration (`config.json`)**:
- `enable-job-taggers` (boolean, default: `false`) - Enables the automatic job tagging system
  - Must be set to `true` to activate automatic tagging on job start/stop events (a minimal snippet follows below)
  - Does not affect the `-apply-tags` command line option
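A minimal sketch of the relevant `config.json` fragment (all other required options omitted; the exact placement of the key may differ between cc-backend versions):
```json
{
  "enable-job-taggers": true
}
```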
**Command Line Options**:
- `-apply-tags` - Apply all tagging rules to existing jobs in the database
  - Works independently of the `enable-job-taggers` configuration
  - Useful for retroactively tagging jobs or re-evaluating jobs with updated rules (see the example below)
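For example, to re-evaluate all rules against the jobs already in the database:
```bash
./cc-backend -apply-tags
```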
### Default Configuration Location
The example configurations are provided in:
- `configs/tagger/apps/` - Example application patterns (16 applications)
- `configs/tagger/jobclasses/` - Example classification rules (3 rules)
Copy these to `var/tagger/` and customize for your environment.
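For example, starting from the cc-backend working directory:
```bash
mkdir -p var/tagger
cp -r configs/tagger/apps var/tagger/
cp -r configs/tagger/jobclasses var/tagger/
```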
### Tag Types
- `app` - Application tags (e.g., "vasp", "gromacs")
- `jobClass` - Classification tags (e.g., "lowutilization", "highload")
Tags can be queried and filtered in the ClusterCockpit UI and API.

go.mod

@@ -10,16 +10,16 @@ tool (
) )
require ( require (
github.com/99designs/gqlgen v0.17.84 github.com/99designs/gqlgen v0.17.85
github.com/ClusterCockpit/cc-lib v1.0.2 github.com/ClusterCockpit/cc-lib/v2 v2.1.0
github.com/Masterminds/squirrel v1.5.4 github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.0 github.com/aws/aws-sdk-go-v2 v1.41.1
github.com/aws/aws-sdk-go-v2/config v1.31.20 github.com/aws/aws-sdk-go-v2/config v1.32.6
github.com/aws/aws-sdk-go-v2/credentials v1.18.24 github.com/aws/aws-sdk-go-v2/credentials v1.19.7
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0
github.com/coreos/go-oidc/v3 v3.16.0 github.com/coreos/go-oidc/v3 v3.17.0
github.com/expr-lang/expr v1.17.6 github.com/expr-lang/expr v1.17.7
github.com/go-co-op/gocron/v2 v2.18.2 github.com/go-co-op/gocron/v2 v2.19.0
github.com/go-ldap/ldap/v3 v3.4.12 github.com/go-ldap/ldap/v3 v3.4.12
github.com/golang-jwt/jwt/v5 v5.3.0 github.com/golang-jwt/jwt/v5 v5.3.0
github.com/golang-migrate/migrate/v4 v4.19.1 github.com/golang-migrate/migrate/v4 v4.19.1
@@ -31,18 +31,16 @@ require (
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
github.com/joho/godotenv v1.5.1 github.com/joho/godotenv v1.5.1
github.com/linkedin/goavro/v2 v2.14.1 github.com/linkedin/goavro/v2 v2.14.1
github.com/mattn/go-sqlite3 v1.14.32 github.com/mattn/go-sqlite3 v1.14.33
github.com/nats-io/nats.go v1.47.0 github.com/nats-io/nats.go v1.47.0
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/common v0.67.4
github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/stretchr/testify v1.11.1 github.com/stretchr/testify v1.11.1
github.com/swaggo/http-swagger v1.3.4 github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.6 github.com/swaggo/swag v1.16.6
github.com/vektah/gqlparser/v2 v2.5.31 github.com/vektah/gqlparser/v2 v2.5.31
golang.org/x/crypto v0.45.0 golang.org/x/crypto v0.46.0
golang.org/x/oauth2 v0.32.0 golang.org/x/oauth2 v0.34.0
golang.org/x/time v0.14.0 golang.org/x/time v0.14.0
) )
@@ -50,22 +48,22 @@ require (
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect
github.com/aws/smithy-go v1.24.0 // indirect github.com/aws/smithy-go v1.24.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect
@@ -85,28 +83,27 @@ require (
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
github.com/goccy/go-yaml v1.19.0 // indirect github.com/goccy/go-yaml v1.19.0 // indirect
github.com/golang/snappy v0.0.4 // indirect github.com/golang/snappy v0.0.4 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect github.com/gorilla/websocket v1.5.3 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect github.com/klauspost/compress v1.18.2 // indirect
github.com/json-iterator/go v1.1.12 // indirect github.com/kr/pretty v0.3.1 // indirect
github.com/klauspost/compress v1.18.1 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/nats-io/nkeys v0.4.12 // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.4 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sosodev/duration v1.3.1 // indirect github.com/sosodev/duration v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect github.com/stretchr/objx v0.5.2 // indirect
github.com/swaggo/files v1.0.1 // indirect github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.7 // indirect github.com/urfave/cli/v2 v2.27.7 // indirect
@@ -114,13 +111,13 @@ require (
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/mod v0.30.0 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
golang.org/x/net v0.47.0 // indirect golang.org/x/mod v0.31.0 // indirect
golang.org/x/sync v0.18.0 // indirect golang.org/x/net v0.48.0 // indirect
golang.org/x/sys v0.38.0 // indirect golang.org/x/sync v0.19.0 // indirect
golang.org/x/text v0.31.0 // indirect golang.org/x/sys v0.39.0 // indirect
golang.org/x/tools v0.39.0 // indirect golang.org/x/text v0.32.0 // indirect
google.golang.org/protobuf v1.36.10 // indirect golang.org/x/tools v0.40.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect
) )

go.sum

@@ -1,11 +1,11 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/99designs/gqlgen v0.17.84 h1:iVMdiStgUVx/BFkMb0J5GAXlqfqtQ7bqMCYK6v52kQ0= github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH1S0=
github.com/99designs/gqlgen v0.17.84/go.mod h1:qjoUqzTeiejdo+bwUg8unqSpeYG42XrcrQboGIezmFA= github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/ClusterCockpit/cc-lib v1.0.2 h1:ZWn3oZkXgxrr3zSigBdlOOfayZ4Om4xL20DhmritPPg= github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
github.com/ClusterCockpit/cc-lib v1.0.2/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k= github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
@@ -14,6 +14,7 @@ github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObk
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI=
@@ -22,52 +23,57 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU=
github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 h1:DHctwEM8P8iTXFxC/QK0MRjwEpWQeM9yzidCRjldUz0= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3/go.mod h1:xdCzcZEtnSTKVDOmUZs4l/j3pSV6rpo1WXl5ugNsL8Y= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4=
github.com/aws/aws-sdk-go-v2/config v1.31.20 h1:/jWF4Wu90EhKCgjTdy1DGxcbcbNrjfBHvksEL79tfQc= github.com/aws/aws-sdk-go-v2/config v1.32.6 h1:hFLBGUKjmLAekvi1evLi5hVvFQtSo3GYwi+Bx4lpJf8=
github.com/aws/aws-sdk-go-v2/config v1.31.20/go.mod h1:95Hh1Tc5VYKL9NJ7tAkDcqeKt+MCXQB1hQZaRdJIZE0= github.com/aws/aws-sdk-go-v2/config v1.32.6/go.mod h1:lcUL/gcd8WyjCrMnxez5OXkO3/rwcNmvfno62tnXNcI=
github.com/aws/aws-sdk-go-v2/credentials v1.18.24 h1:iJ2FmPT35EaIB0+kMa6TnQ+PwG5A1prEdAw+PsMzfHg= github.com/aws/aws-sdk-go-v2/credentials v1.19.7 h1:tHK47VqqtJxOymRrNtUXN5SP/zUTvZKeLx4tH6PGQc8=
github.com/aws/aws-sdk-go-v2/credentials v1.18.24/go.mod h1:U91+DrfjAiXPDEGYhh/x29o4p0qHX5HDqG7y5VViv64= github.com/aws/aws-sdk-go-v2/credentials v1.19.7/go.mod h1:qOZk8sPDrxhf+4Wf4oT2urYJrYt3RejHSzgAquYeppw=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 h1:eg/WYAa12vqTphzIdWMzqYRVKKnCboVPRlvaybNCqPA= github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 h1:CjMzUs78RDDv4ROu3JnJn/Ig1r6ZD7/T2DXLLRpejic=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13/go.mod h1:/FDdxWhz1486obGrKKC1HONd7krpk38LBt+dutLcN9k= github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16/go.mod h1:uVW4OLBqbJXSHJYA9svT9BluSvvwbzLQ2Crf6UPzR3c=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 h1:NvMjwvv8hpGUILarKw7Z4Q0w1H9anXKsesMxtw++MA4= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7 h1:DIBqIrJ7hv+e4CmIk2z3pyKT+3B6qVMgRsawHiR3qso=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4/go.mod h1:455WPHSwaGj2waRSpQp7TsnpOnBfw8iDfPfbwl7KPJE= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.7/go.mod h1:vLm00xmBke75UmpNvOcZQ/Q30ZFjbczeLFqGx5urmGo=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 h1:RuNSMoozM8oXlgLG/n6WLaFGoea7/CddrCfIiSA+xdY=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17/go.mod h1:F2xxQ9TZz5gDWsclCtPQscGpP0VUOc8RqgFM3vDENmU=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 h1:zhBJXdhWIFZ1acfDYIhu4+LCzdUS2Vbcum7D01dXlHQ= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16 h1:NSbvS17MlI2lurYgXnCOLvCFX38sBW4eiVER7+kkgsU=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13/go.mod h1:JaaOeCE368qn2Hzi3sEzY6FgAZVCIYcC2nwbro2QCh8= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.16/go.mod h1:SwT8Tmqd4sA6G1qaGdzWCJN99bUmPGHfRwwq3G5Qb+A=
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 h1:DhdbtDl4FdNlj31+xiRXANxEE+eC7n8JQz+/ilwQ8Uc= github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0 h1:MIWra+MSq53CFaXXAywB2qg9YvVZifkk6vEGl/1Qor0=
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2/go.mod h1:+wArOOrcHUevqdto9k1tKOF5++YTe9JEcPSc9Tx2ZSw= github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0/go.mod h1:79S2BdqCJpScXZA2y+cpZuocWsjGjJINyXnOsf5DTz8=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 h1:NjShtS1t8r5LUfFVtFeI8xLAHQNTa7UI0VawXlrBMFQ= github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 h1:VrhDvQib/i0lxvr3zqlUwLwJP4fpmpyD9wYG1vfSu+Y=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k= github.com/aws/aws-sdk-go-v2/service/signin v1.0.5/go.mod h1:k029+U8SY30/3/ras4G/Fnv/b88N4mAfliNn08Dem4M=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 h1:gTsnx0xXNQ6SBbymoDvcoRHL+q4l/dAFsQuKfDWSaGc= github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 h1:v6EiMvhEYBoHABfbGB4alOYmCIrcgyPPiBE1wZAEbqk=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo= github.com/aws/aws-sdk-go-v2/service/sso v1.30.9/go.mod h1:yifAsgBxgJWn3ggx70A3urX2AN49Y5sJTD1UQFlfqBw=
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 h1:HK5ON3KmQV2HcAunnx4sKLB9aPf3gKGwVAf7xnx0QT0= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 h1:gd84Omyu9JLriJVCbGApcLzVR3XtmC4ZDPcAI6Ftvds=
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13/go.mod h1:sTGThjphYE4Ohw8vJiRStAcu3rbjtXRsdNB0TvZ5wwo=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/pJ1jOWYlFDJTjRQ=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ=
github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk=
github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coreos/go-oidc/v3 v3.16.0 h1:qRQUCFstKpXwmEjDQTIbyY/5jF00+asXzSkmkoa/mow= github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc=
github.com/coreos/go-oidc/v3 v3.16.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
@@ -77,8 +83,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec= github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8=
github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
@@ -89,8 +95,8 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-co-op/gocron/v2 v2.18.2 h1:+5VU41FUXPWSPKLXZQ/77SGzUiPCcakU0v7ENc2H20Q= github.com/go-co-op/gocron/v2 v2.19.0 h1:OKf2y6LXPs/BgBI2fl8PxUpNAI1DA9Mg+hSeGOS38OU=
github.com/go-co-op/gocron/v2 v2.18.2/go.mod h1:Zii6he+Zfgy5W9B+JKk/KwejFOW0kZTFvHtwIpR4aBI= github.com/go-co-op/gocron/v2 v2.19.0/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U=
github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs=
github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4= github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4=
@@ -140,7 +146,8 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
@@ -190,12 +197,9 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I= github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I=
github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60= github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -214,27 +218,27 @@ github.com/linkedin/goavro/v2 v2.14.1 h1:/8VjDpd38PRsy02JS0jflAu7JZPfJcGTwqWgMkF
github.com/linkedin/goavro/v2 v2.14.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/linkedin/goavro/v2 v2.14.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= github.com/mattn/go-sqlite3 v1.14.33 h1:A5blZ5ulQo2AtayQ9/limgHEkFreKj1Dv226a1K73s0=
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.33/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM= github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM=
github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0= github.com/nats-io/nkeys v0.4.12 h1:nssm7JKOG9/x4J8II47VWCL1Ds29avyiQDRn0ckMvDc=
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE= github.com/nats-io/nkeys v0.4.12/go.mod h1:MT59A1HYcjIcyQDJStTfaOY6vhy9XTUjOFo+SVsvpBg=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -250,6 +254,7 @@ github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAi
github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU= github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
@@ -260,6 +265,9 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
@@ -294,33 +302,33 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -328,19 +336,19 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=


@@ -52,51 +52,51 @@ models:
- github.com/99designs/gqlgen/graphql.Int64 - github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32 - github.com/99designs/gqlgen/graphql.Int32
Job: Job:
model: "github.com/ClusterCockpit/cc-lib/schema.Job" model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job"
fields: fields:
tags: tags:
resolver: true resolver: true
metaData: metaData:
resolver: true resolver: true
Cluster: Cluster:
model: "github.com/ClusterCockpit/cc-lib/schema.Cluster" model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster"
fields: fields:
partitions: partitions:
resolver: true resolver: true
# Node: # Node:
# model: "github.com/ClusterCockpit/cc-lib/schema.Node" # model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node"
# fields: # fields:
# metaData: # metaData:
# resolver: true # resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" } NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" } MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" } MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" }
JobStatistics: JobStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" }
GlobalMetricListItem: GlobalMetricListItem:
{ model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" }
ClusterSupport: ClusterSupport:
{ model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" } Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" } Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" } JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" }
Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" } Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" }
SchedulerState: SchedulerState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" }
HealthState: HealthState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" } JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" } Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" }
MetricStatistics: MetricStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" }
MetricConfig: MetricConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" }
SubClusterConfig: SubClusterConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" } Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" } Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" }
FilterRanges: FilterRanges:
{ model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" } { model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" } SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" } StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" } Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" }


@@ -17,26 +17,27 @@ import (
"strings" "strings"
"testing" "testing"
"time" "time"
"sync"
"github.com/ClusterCockpit/cc-backend/internal/api" "github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver" "github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux" "github.com/gorilla/mux"
_ "github.com/mattn/go-sqlite3" _ "github.com/mattn/go-sqlite3"
) )
func setup(t *testing.T) *api.RestAPI { func setup(t *testing.T) *api.RestAPI {
repository.ResetConnection()
const testconfig = `{ const testconfig = `{
"main": { "main": {
"addr": "0.0.0.0:8080", "addr": "0.0.0.0:8080",
@@ -53,17 +54,7 @@ func setup(t *testing.T) *api.RestAPI {
"jwts": { "jwts": {
"max-age": "2m" "max-age": "2m"
} }
},
"clusters": [
{
"name": "testcluster",
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
} }
}
]
}` }`
const testclusterJSON = `{ const testclusterJSON = `{
"name": "testcluster", "name": "testcluster",
@@ -155,11 +146,7 @@ func setup(t *testing.T) *api.RestAPI {
// Load and check main configuration // Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil { if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { config.Init(cfg)
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
} else { } else {
cclog.Abort("Main configuration must be present") cclog.Abort("Main configuration must be present")
} }
@@ -171,13 +158,7 @@ func setup(t *testing.T) *api.RestAPI {
t.Fatal(err) t.Fatal(err)
} }
// Initialize memorystore (optional - will return nil if not configured) // metricstore initialization removed - it's initialized via callback in tests
// For this test, we don't initialize it to test the nil handling
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
var wg sync.WaitGroup
memorystore.Init(mscfg, &wg)
}
archiver.Start(repository.GetJobRepository(), context.Background()) archiver.Start(repository.GetJobRepository(), context.Background())
@@ -194,30 +175,45 @@ func setup(t *testing.T) *api.RestAPI {
} }
func cleanup() { func cleanup() {
// Gracefully shutdown archiver with timeout
if err := archiver.Shutdown(5 * time.Second); err != nil { if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err) cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
} }
// Shutdown memorystore if it was initialized
memorystore.Shutdown()
} }
/* /*
* This function starts a job, stops it, and tests the REST API. * This function starts a job, stops it, and then reads its data from the job-archive.
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because * Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
* at least `setup` modifies global state. * at least `setup` modifies global state.
*/ */
func TestRestApi(t *testing.T) { func TestRestApi(t *testing.T) {
restapi := setup(t) restapi := setup(t)
t.Cleanup(cleanup) t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter() r := mux.NewRouter()
r.PathPrefix("/api").Subrouter() r.PathPrefix("/api").Subrouter()
r.StrictSlash(true) r.StrictSlash(true)
restapi.MountAPIRoutes(r) restapi.MountAPIRoutes(r)
var TestJobId int64 = 123 var TestJobID int64 = 123
TestClusterName := "testcluster" TestClusterName := "testcluster"
var TestStartTime int64 = 123456789 var TestStartTime int64 = 123456789
@@ -265,12 +261,18 @@ func TestRestApi(t *testing.T) {
if response.StatusCode != http.StatusCreated { if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String()) t.Fatal(response.Status, recorder.Body.String())
} }
// resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs() restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
// job.Tags, err = resolver.Job().Tags(ctx, job)
// if err != nil {
// t.Fatal(err)
// }
if job.JobID != 123 || if job.JobID != 123 ||
job.User != "testuser" || job.User != "testuser" ||
job.Project != "testproj" || job.Project != "testproj" ||
@@ -288,6 +290,10 @@ func TestRestApi(t *testing.T) {
job.StartTime != 123456789 { job.StartTime != 123456789 {
t.Fatalf("unexpected job properties: %#v", job) t.Fatalf("unexpected job properties: %#v", job)
} }
// if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
// t.Fatalf("unexpected tags: %#v", job.Tags)
// }
}); !ok { }); !ok {
return return
} }
@@ -301,6 +307,7 @@ func TestRestApi(t *testing.T) {
"stopTime": 123457789 "stopTime": 123457789
}` }`
var stoppedJob *schema.Job
if ok := t.Run("StopJob", func(t *testing.T) { if ok := t.Run("StopJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody))) req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
recorder := httptest.NewRecorder() recorder := httptest.NewRecorder()
@@ -314,7 +321,7 @@ func TestRestApi(t *testing.T) {
} }
// Archiving happens asynchronously, will be completed in cleanup // Archiving happens asynchronously, will be completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -336,12 +343,21 @@ func TestRestApi(t *testing.T) {
t.Fatalf("unexpected job.metaData: %#v", job.MetaData) t.Fatalf("unexpected job.metaData: %#v", job.MetaData)
} }
stoppedJob = job
}); !ok { }); !ok {
return return
} }
// Note: We skip the CheckArchive test because without memorystore initialized, t.Run("CheckArchive", func(t *testing.T) {
// archiving will fail gracefully. This test now focuses on the REST API itself. data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(data, testData) {
t.Fatal("unexpected data fetched from archive")
}
})
t.Run("CheckDoubleStart", func(t *testing.T) { t.Run("CheckDoubleStart", func(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart! // Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart!

View File

@@ -13,7 +13,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
// GetClustersAPIResponse model // GetClustersAPIResponse model
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. // @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json // @produce json
// @param cluster query string false "Job Cluster" // @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters" // @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"

File diff suppressed because it is too large

View File

@@ -22,11 +22,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher" "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux" "github.com/gorilla/mux"
) )
@@ -104,7 +104,7 @@ type JobMetricWithName struct {
// @param items-per-page query int false "Items per page (Default: 25)" // @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)" // @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response" // @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info" // @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -232,7 +232,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @produce json // @produce json
// @param id path int true "Database ID of Job" // @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics" // @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource" // @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -293,7 +293,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
} }
if r.URL.Query().Get("all-metrics") == "true" { if r.URL.Query().Get("all-metrics") == "true" {
data, err = metricdispatcher.LoadData(job, nil, scopes, r.Context(), resolution) data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
if err != nil { if err != nil {
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster) cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
return return
@@ -324,8 +324,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @accept json // @accept json
// @produce json // @produce json
// @param id path int true "Database ID of Job" // @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names" // @param request body api.GetJobAPIRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource" // @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -389,7 +389,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
resolution = max(resolution, mc.Timestep) resolution = max(resolution, mc.Timestep)
} }
data, err := metricdispatcher.LoadData(job, metrics, scopes, r.Context(), resolution) data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
if err != nil { if err != nil {
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster) cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
return return
@@ -478,7 +478,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @accept json // @accept json
// @produce json // @produce json
// @param id path int true "Job Database ID" // @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add" // @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource" // @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -542,7 +542,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @accept json // @accept json
// @produce json // @produce json
// @param id path int true "Job Database ID" // @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" // @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource" // @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -606,7 +606,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
// @description Tags will be removed from respective archive files. // @description Tags will be removed from respective archive files.
// @accept json // @accept json
// @produce plain // @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" // @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response" // @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -650,7 +650,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
// @accept json // @accept json
// @produce json // @produce json
// @param request body schema.Job true "Job to add" // @param request body schema.Job true "Job to add"
// @success 201 {object} api.DefaultApiResponse "Job added successfully" // @success 201 {object} api.DefaultAPIResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -728,7 +728,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
// @description Job to stop is specified by request body. All fields are required in this case. // @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to 'Job' scheme. // @description Returns full job resource information according to 'Job' scheme.
// @produce json // @produce json
// @param request body api.StopJobApiRequest true "All fields required" // @param request body api.StopJobAPIRequest true "All fields required"
// @success 200 {object} schema.Job "Success message" // @success 200 {object} schema.Job "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -754,7 +754,6 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
return return
} }
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil { if err != nil {
// Try cached jobs if not found in main repository // Try cached jobs if not found in main repository
@@ -776,7 +775,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @description Job to remove is specified by database ID. This will not remove the job from the job archive. // @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json // @produce json
// @param id path int true "Database ID of Job" // @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultApiResponse "Success message" // @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -820,8 +819,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// @description Job to delete is specified by request body. All fields are required in this case. // @description Job to delete is specified by request body. All fields are required in this case.
// @accept json // @accept json
// @produce json // @produce json
// @param request body api.DeleteJobApiRequest true "All fields required" // @param request body api.DeleteJobAPIRequest true "All fields required"
// @success 200 {object} api.DefaultApiResponse "Success message" // @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -873,7 +872,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive. // @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json // @produce json
// @param ts path int true "Unix epoch timestamp" // @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultApiResponse "Success message" // @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden" // @failure 403 {object} api.ErrorResponse "Forbidden"

View File

@@ -15,8 +15,8 @@ import (
"strconv" "strconv"
"strings" "strings"
"github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol" "github.com/influxdata/line-protocol/v2/lineprotocol"
) )
@@ -58,7 +58,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
return return
} }
ms := memorystore.GetMemoryStore() ms := metricstore.GetMemoryStore()
n := 0 n := 0
for _, sel := range selectors { for _, sel := range selectors {
bn, err := ms.Free(sel, to) bn, err := ms.Free(sel, to)
@@ -97,9 +97,9 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
return return
} }
ms := memorystore.GetMemoryStore() ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes) dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
cclog.Errorf("/api/write error: %s", err.Error()) cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw) handleError(err, http.StatusBadRequest, rw)
return return
@@ -129,7 +129,7 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
selector = strings.Split(raw, ":") selector = strings.Split(raw, ":")
} }
ms := memorystore.GetMemoryStore() ms := metricstore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil { if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw) handleError(err, http.StatusBadRequest, rw)
return return
@@ -162,7 +162,7 @@ func metricsHealth(rw http.ResponseWriter, r *http.Request) {
selector := []string{rawCluster, rawNode} selector := []string{rawCluster, rawNode}
ms := memorystore.GetMemoryStore() ms := metricstore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil { if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw) handleError(err, http.StatusBadRequest, rw)
return return

View File

@@ -6,9 +6,9 @@
package api package api
import ( import (
"bytes"
"database/sql" "database/sql"
"encoding/json" "encoding/json"
"strings"
"sync" "sync"
"time" "time"
@@ -17,12 +17,48 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/nats" "github.com/ClusterCockpit/cc-backend/pkg/nats"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/schema"
influx "github.com/influxdata/line-protocol/v2/lineprotocol"
) )
// NatsAPI provides NATS subscription-based handlers for Job and Node operations. // NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging. // It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
// job,function=start_job event="{...JSON payload...}" <timestamp>
// job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
// nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct { type NatsAPI struct {
JobRepository *repository.JobRepository JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions // RepositoryMutex protects job creation operations from race conditions
@@ -50,11 +86,7 @@ func (api *NatsAPI) StartSubscriptions() error {
s := config.Keys.APISubjects s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobStart, api.handleStartJob); err != nil { if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
return err
}
if err := client.Subscribe(s.SubjectJobStop, api.handleStopJob); err != nil {
return err return err
} }
@@ -67,26 +99,96 @@ func (api *NatsAPI) StartSubscriptions() error {
return nil return nil
} }
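For illustration, a minimal publisher-side sketch of the message format documented above. It is not part of this diff: the subject name "cc.job.event" and the NATS URL are placeholders (the real subject is taken from config.Keys.APISubjects.SubjectJobEvent), and the JSON payload is abbreviated; an actual job start event must carry all fields required by schema.Job and importer.SanityChecks.

package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	// Placeholder URL; the backend reads its NATS settings from the configuration.
	nc, err := nats.Connect(nats.DefaultURL)
	if err != nil {
		panic(err)
	}
	defer nc.Close()

	// Abbreviated schema.Job payload; quotes must be escaped because the JSON
	// is embedded in a line protocol string field.
	payload := `{"jobId":1001,"user":"testuser","cluster":"testcluster","startTime":1234567890}`
	escaped := strings.ReplaceAll(payload, `"`, `\"`)

	line := fmt.Sprintf(`job,function=start_job event="%s" %d`, escaped, time.Now().UnixNano())

	// "cc.job.event" is a hypothetical subject name.
	if err := nc.Publish("cc.job.event", []byte(line)); err != nil {
		panic(err)
	}
}

On the receiving side, handleJobEvent decodes the line, checks the measurement name and the "function" tag, and passes the embedded JSON payload to handleStartJob or handleStopJob, as shown below.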
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
function, ok := msg.GetTag("function")
if !ok {
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
return
}
switch function {
case "start_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job start event is missing event field with JSON payload")
return
}
api.handleStartJob(v)
case "stop_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job stop event is missing event field with JSON payload")
return
}
api.handleStopJob(v)
default:
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
}
}
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
// - tag "function" with value "start_job" or "stop_job"
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "job" {
api.processJobEvent(m)
} else {
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
}
}
}
// handleStartJob processes job start messages received via NATS. // handleStartJob processes job start messages received via NATS.
// Expected JSON payload follows the schema.Job structure. // The payload parameter contains JSON following the schema.Job structure.
func (api *NatsAPI) handleStartJob(subject string, data []byte) { // Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
if payload == "" {
cclog.Error("NATS start job: payload is empty")
return
}
req := schema.Job{ req := schema.Job{
Shared: "none", Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
} }
dec := json.NewDecoder(bytes.NewReader(data)) dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields() dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil { if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err) cclog.Errorf("NATS start job: parsing request failed: %v", err)
return return
} }
cclog.Debugf("NATS %s: %s", subject, req.GoString()) cclog.Debugf("NATS start job: %s", req.GoString())
req.State = schema.JobStateRunning req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil { if err := importer.SanityChecks(&req); err != nil {
cclog.Errorf("NATS %s: sanity check failed: %v", subject, err) cclog.Errorf("NATS start job: sanity check failed: %v", err)
return return
} }
@@ -96,14 +198,14 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil) jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows { if err != nil && err != sql.ErrNoRows {
cclog.Errorf("NATS %s: checking for duplicate failed: %v", subject, err) cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
return return
} }
if err == nil { if err == nil {
for _, job := range jobs { for _, job := range jobs {
if (req.StartTime - job.StartTime) < secondsPerDay { if (req.StartTime - job.StartTime) < secondsPerDay {
cclog.Errorf("NATS %s: job with jobId %d, cluster %s already exists (dbid: %d)", cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
subject, req.JobID, req.Cluster, job.ID) req.JobID, req.Cluster, job.ID)
return return
} }
} }
@@ -111,14 +213,14 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
id, err := api.JobRepository.Start(&req) id, err := api.JobRepository.Start(&req)
if err != nil { if err != nil {
cclog.Errorf("NATS %s: insert into database failed: %v", subject, err) cclog.Errorf("NATS start job: insert into database failed: %v", err)
return return
} }
unlockOnce.Do(api.RepositoryMutex.Unlock) unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags { for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil { if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Errorf("NATS %s: adding tag to new job %d failed: %v", subject, id, err) cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
return return
} }
} }
@@ -128,19 +230,24 @@ func (api *NatsAPI) handleStartJob(subject string, data []byte) {
} }
// handleStopJob processes job stop messages received via NATS. // handleStopJob processes job stop messages received via NATS.
// Expected JSON payload follows the StopJobAPIRequest structure. // The payload parameter contains JSON following the StopJobAPIRequest structure.
func (api *NatsAPI) handleStopJob(subject string, data []byte) { // The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
if payload == "" {
cclog.Error("NATS stop job: payload is empty")
return
}
var req StopJobAPIRequest var req StopJobAPIRequest
dec := json.NewDecoder(bytes.NewReader(data)) dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields() dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil { if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err) cclog.Errorf("NATS job stop: parsing request failed: %v", err)
return return
} }
if req.JobID == nil { if req.JobID == nil {
cclog.Errorf("NATS %s: the field 'jobId' is required", subject) cclog.Errorf("NATS job stop: the field 'jobId' is required")
return return
} }
@@ -148,28 +255,28 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
if err != nil { if err != nil {
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if cachedErr != nil { if cachedErr != nil {
cclog.Errorf("NATS %s: finding job failed: %v (cached lookup also failed: %v)", cclog.Errorf("NATS job stop: finding job failed: %v (cached lookup also failed: %v)",
subject, err, cachedErr) err, cachedErr)
return return
} }
job = cachedJob job = cachedJob
} }
if job.State != schema.JobStateRunning { if job.State != schema.JobStateRunning {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: job has already been stopped (state is: %s)", cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
subject, job.JobID, job.ID, job.Cluster, job.State) job.JobID, job.ID, job.Cluster, job.State)
return return
} }
if job.StartTime > req.StopTime { if job.StartTime > req.StopTime {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d", cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
subject, job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime) job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
return return
} }
if req.State != "" && !req.State.Valid() { if req.State != "" && !req.State.Valid() {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: invalid job state: %#v", cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
subject, job.JobID, job.ID, job.Cluster, req.State) job.JobID, job.ID, job.Cluster, req.State)
return return
} else if req.State == "" { } else if req.State == "" {
req.State = schema.JobStateCompleted req.State = schema.JobStateCompleted
@@ -182,8 +289,8 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: marking job as '%s' failed: %v", cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
subject, job.JobID, job.ID, job.Cluster, job.State, err) job.JobID, job.ID, job.Cluster, job.State, err)
return return
} }
} }
@@ -198,15 +305,21 @@ func (api *NatsAPI) handleStopJob(subject string, data []byte) {
archiver.TriggerArchiving(job) archiver.TriggerArchiving(job)
} }
// handleNodeState processes node state update messages received via NATS. // processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Expected JSON payload follows the UpdateNodeStatesRequest structure. // Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) handleNodeState(subject string, data []byte) { func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Nodestate event is missing event field with JSON payload")
return
}
var req UpdateNodeStatesRequest var req UpdateNodeStatesRequest
dec := json.NewDecoder(bytes.NewReader(data)) dec := json.NewDecoder(strings.NewReader(v))
dec.DisallowUnknownFields() dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil { if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err) cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
return return
} }
@@ -224,8 +337,44 @@ func (api *NatsAPI) handleNodeState(subject string, data []byte) {
JobsRunning: node.JobsRunning, JobsRunning: node.JobsRunning,
} }
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
} }
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster) cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "nodestate" {
api.processNodestateEvent(m)
} else {
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
}
}
} }
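The node state path follows the same pattern. A short publisher-side sketch, again not part of this diff, reusing the connection nc and the imports from the job event example above; the payload mirrors the example in the handler comment and the subject name is a placeholder.

// Assuming an existing *nats.Conn (nc) as in the job event sketch.
payload := `{"cluster":"testcluster","nodes":[{"hostname":"node01","states":["idle"]}]}`
escaped := strings.ReplaceAll(payload, `"`, `\"`)
line := fmt.Sprintf(`nodestate event="%s" %d`, escaped, time.Now().UnixNano())
// "cc.node.state" is a hypothetical subject name.
if err := nc.Publish("cc.node.state", []byte(line)); err != nil {
	panic(err)
}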

internal/api/nats_test.go (new file, 947 lines added)
View File

@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
func setupNatsTest(t *testing.T) *NatsAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc1",
"nodes": "host123,host124,host125",
"processorType": "Intel Core i7-4770",
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
}
}
],
"metricConfig": [
{
"name": "load_one",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
"alert": 0
}
]
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return NewNatsAPI()
}
func cleanupNatsTest() {
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
func TestNatsHandleStartJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
shouldFindJob bool
}{
{
name: "valid job start",
payload: `{
"jobId": 1001,
"user": "testuser1",
"project": "testproj1",
"cluster": "testcluster",
"partition": "main",
"walltime": 7200,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1001 {
t.Errorf("expected JobID 1001, got %d", job.JobID)
}
if job.User != "testuser1" {
t.Errorf("expected user testuser1, got %s", job.User)
}
if job.State != schema.JobStateRunning {
t.Errorf("expected state running, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number",
"user": "testuser2"
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "missing required fields",
payload: `{
"jobId": 1002
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
payload: `{
"jobId": 1003,
"user": "testuser3",
"project": "testproj3",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"unknownField": "should cause error",
"startTime": 1234567900
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with tags",
payload: `{
"jobId": 1004,
"user": "testuser4",
"project": "testproj4",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"tags": [
{
"type": "test",
"name": "testtag",
"scope": "testuser4"
}
],
"startTime": 1234567910
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1004 {
t.Errorf("expected JobID 1004, got %d", job.JobID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleStartJob(tt.payload)
natsAPI.JobRepository.SyncJobs()
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if tt.shouldFindJob {
// Extract jobId from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
startTime := int64(payloadMap["startTime"].(float64))
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
if err != nil {
if !tt.expectError {
t.Fatalf("expected to find job, but got error: %v", err)
}
return
}
if tt.validateJob != nil {
tt.validateJob(t, job)
}
}
})
}
}
func TestNatsHandleStopJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// First, create a running job
startPayload := `{
"jobId": 2001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(startPayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
setupJobFunc func() // Optional: create specific test job
}{
{
name: "valid job stop - completed",
payload: `{
"jobId": 2001,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateCompleted {
t.Errorf("expected state completed, got %s", job.State)
}
expectedDuration := int32(1234571490 - 1234567890)
if job.Duration != expectedDuration {
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
}
},
},
{
name: "valid job stop - failed",
setupJobFunc: func() {
startPayloadFailed := `{
"jobId": 2002,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(startPayloadFailed)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2002,
"cluster": "testcluster",
"startTime": 1234567900,
"jobState": "failed",
"stopTime": 1234569900
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateFailed {
t.Errorf("expected state failed, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number"
}`,
expectError: true,
},
{
name: "missing jobId",
payload: `{
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
{
name: "invalid job state",
setupJobFunc: func() {
startPayloadInvalid := `{
"jobId": 2003,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1]
}
],
"startTime": 1234567910
}`
natsAPI.handleStartJob(startPayloadInvalid)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2003,
"cluster": "testcluster",
"startTime": 1234567910,
"jobState": "invalid_state",
"stopTime": 1234571510
}`,
expectError: true,
},
{
name: "stopTime before startTime",
setupJobFunc: func() {
startPayloadTime := `{
"jobId": 2004,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0]
}
],
"startTime": 1234567920
}`
natsAPI.handleStartJob(startPayloadTime)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2004,
"cluster": "testcluster",
"startTime": 1234567920,
"jobState": "completed",
"stopTime": 1234567900
}`,
expectError: true,
},
{
name: "job not found",
payload: `{
"jobId": 99999,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
}
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.setupJobFunc != nil {
tt.setupJobFunc()
}
natsAPI.handleStopJob(tt.payload)
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if !tt.expectError && tt.validateJob != nil {
// Extract job details from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
var startTime *int64
if st, ok := payloadMap["startTime"]; ok {
ts := int64(st.(float64))
startTime = &ts
}
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
if err != nil {
t.Fatalf("expected to find job, but got error: %v", err)
}
tt.validateJob(t, job)
}
})
}
}
func TestNatsHandleNodeState(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
validateFn func(t *testing.T)
}{
{
name: "valid node state update",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
validateFn: func(t *testing.T) {
// In a full test, we would verify the node state was updated in the database
// For now, just ensure no error occurred
},
},
{
name: "multiple nodes",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid JSON in event field",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
expectError: true,
},
{
name: "empty nodes array",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: false, // Empty array should not cause error
},
{
name: "invalid line protocol format",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Should be handled gracefully with warning
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
// Allow some time for async operations
time.Sleep(50 * time.Millisecond)
if tt.validateFn != nil {
tt.validateFn(t)
}
})
}
}
func TestNatsProcessJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
msgStartJob, err := lp.NewMessage(
"job",
map[string]string{"function": "start_job"},
nil,
map[string]any{
"event": `{
"jobId": 3001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgMissingTag, err := lp.NewMessage(
"job",
map[string]string{},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgUnknownFunc, err := lp.NewMessage(
"job",
map[string]string{"function": "unknown_function"},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
tests := []struct {
name string
message lp.CCMessage
expectError bool
}{
{
name: "start_job function",
message: msgStartJob,
expectError: false,
},
{
name: "missing function tag",
message: msgMissingTag,
expectError: true,
},
{
name: "unknown function",
message: msgUnknownFunc,
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.processJobEvent(tt.message)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
}{
{
name: "valid influx line protocol",
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid influx line protocol",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Decoder should handle empty input gracefully
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// HandleJobEvent doesn't return errors, it logs them
// We're just ensuring it doesn't panic
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "non-event message (metric data)",
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
expectError: false,
description: "Should skip non-event messages gracefully",
},
{
name: "wrong measurement name",
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
expectError: false,
description: "Should warn about unexpected measurement but not fail",
},
{
name: "missing event field",
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
expectError: true,
description: "Should error when event field is missing",
},
{
name: "multiple measurements in one message",
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
expectError: false,
description: "Should process multiple lines",
},
{
name: "escaped quotes in JSON payload",
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
expectError: true,
description: "Should handle escaped quotes (though JSON parsing may fail)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "missing cluster field in JSON",
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail when cluster is missing",
},
{
name: "malformed JSON with unescaped quotes",
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail on malformed JSON",
},
{
name: "unicode characters in hostname",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle unicode characters",
},
{
name: "very large node count",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle multiple nodes efficiently",
},
{
name: "timestamp in past",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
expectError: false,
description: "Should accept any valid timestamp",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// Start a job
payload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(payload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Try to start the same job again (within 24 hours)
duplicatePayload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(duplicatePayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Verify only one job exists
jobID := int64(5001)
cluster := "testcluster"
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
if err != nil && err != sql.ErrNoRows {
t.Fatalf("unexpected error: %v", err)
}
if len(jobs) != 1 {
t.Errorf("expected 1 job, got %d", len(jobs))
}
}

View File

@@ -12,7 +12,7 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- "github.com/ClusterCockpit/cc-lib/schema"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
type UpdateNodeStatesRequest struct {
@@ -47,7 +47,7 @@ func determineState(states []string) schema.SchedulerState {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
- // @success 200 {object} api.DefaultApiResponse "Success message"
+ // @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"


@@ -22,9 +22,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
- "github.com/ClusterCockpit/cc-lib/util"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
+ "github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/mux"
)
@@ -48,6 +48,7 @@ import (
const (
noticeFilePath = "./var/notice.txt"
noticeFilePerms = 0o644
+ maxNoticeLength = 10000 // Maximum allowed notice content length in characters
)
type RestAPI struct { type RestAPI struct {
@@ -61,6 +62,7 @@ type RestAPI struct {
RepositoryMutex sync.Mutex
}
+ // New creates and initializes a new RestAPI instance with configured dependencies.
func New() *RestAPI {
return &RestAPI{
JobRepository: repository.GetJobRepository(),
@@ -69,6 +71,8 @@ func New() *RestAPI {
}
}
+ // MountAPIRoutes registers REST API endpoints for job and cluster management.
+ // These routes use JWT token authentication via the X-Auth-Token header.
func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// REST API Uses TokenAuth
@@ -103,6 +107,8 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
}
}
+ // MountUserAPIRoutes registers user-accessible REST API endpoints.
+ // These are limited endpoints for regular users with JWT token authentication.
func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// REST API Uses TokenAuth
@@ -112,6 +118,8 @@ func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
}
+ // MountMetricStoreAPIRoutes registers metric storage API endpoints.
+ // These endpoints handle metric data ingestion and health checks with JWT token authentication.
func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
// REST API Uses TokenAuth
// Note: StrictSlash handles trailing slash variations automatically
@@ -126,6 +134,8 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
}
+ // MountConfigAPIRoutes registers configuration and user management endpoints.
+ // These routes use session-based authentication and require admin privileges.
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// Settings Frontend Uses SessionAuth
@@ -139,6 +149,8 @@ func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
}
}
+ // MountFrontendAPIRoutes registers frontend-specific API endpoints.
+ // These routes support JWT generation and user configuration updates with session authentication.
func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// Settings Frontend Uses SessionAuth
@@ -160,6 +172,8 @@ type DefaultAPIResponse struct {
Message string `json:"msg"`
}
+ // handleError writes a standardized JSON error response with the given status code.
+ // It logs the error at WARN level and ensures proper Content-Type headers are set.
func handleError(err error, statusCode int, rw http.ResponseWriter) {
cclog.Warnf("REST ERROR : %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
@@ -172,15 +186,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
}
}
+ // decode reads JSON from r into val with strict validation that rejects unknown fields.
func decode(r io.Reader, val any) error {
dec := json.NewDecoder(r)
dec.DisallowUnknownFields()
return dec.Decode(val)
}
- func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
- // SecuredCheck() only worked with TokenAuth: Removed
+ // validatePathComponent checks if a path component contains potentially malicious patterns
+ // that could be used for path traversal attacks. Returns an error if validation fails.
+ func validatePathComponent(component, componentName string) error {
+ if strings.Contains(component, "..") ||
+ strings.Contains(component, "/") ||
+ strings.Contains(component, "\\") {
+ return fmt.Errorf("invalid %s", componentName)
+ }
+ return nil
+ }
+ // editNotice godoc
+ // @summary Update system notice
+ // @tags Config
+ // @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
+ // @accept mpfd
+ // @produce plain
+ // @param new-content formData string true "New notice content (max 10000 characters)"
+ // @success 200 {string} string "Update Notice Content Success"
+ // @failure 400 {object} ErrorResponse "Bad Request"
+ // @failure 403 {object} ErrorResponse "Forbidden"
+ // @failure 500 {object} ErrorResponse "Internal Server Error"
+ // @security ApiKeyAuth
+ // @router /notice/ [post]
+ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
return
@@ -189,9 +226,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// Get Value
newContent := r.FormValue("new-content")
- // Validate content length to prevent DoS
- if len(newContent) > 10000 {
- handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw)
+ if len(newContent) > maxNoticeLength {
+ handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
return
}
@@ -203,7 +239,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
return
}
- ntxt.Close()
+ if err := ntxt.Close(); err != nil {
+ cclog.Warnf("Failed to close notice file: %v", err)
+ }
}
if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
@@ -213,13 +251,30 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
+ var msg []byte
if newContent != "" {
- rw.Write([]byte("Update Notice Content Success"))
+ msg = []byte("Update Notice Content Success")
} else {
- rw.Write([]byte("Empty Notice Content Success"))
+ msg = []byte("Empty Notice Content Success")
}
+ if _, err := rw.Write(msg); err != nil {
+ cclog.Errorf("Failed to write response: %v", err)
+ }
}
// getJWT godoc
// @summary Generate JWT token
// @tags Frontend
// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
// @accept mpfd
// @produce plain
// @param username formData string true "Username to generate JWT for"
// @success 200 {string} string "JWT token"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 404 {object} ErrorResponse "User Not Found"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /jwt/ [get]
func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
username := r.FormValue("username")
@@ -244,12 +299,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
}
rw.WriteHeader(http.StatusOK)
- rw.Write([]byte(jwt))
+ if _, err := rw.Write([]byte(jwt)); err != nil {
+ cclog.Errorf("Failed to write JWT response: %v", err)
+ }
}
// getRoles godoc
// @summary Get available roles
// @tags Config
// @description Returns a list of valid user roles. Only admins are allowed.
// @produce json
// @success 200 {array} string "List of role names"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /roles/ [get]
func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
- // SecuredCheck() only worked with TokenAuth: Removed
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
@@ -268,6 +333,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
}
}
// updateConfiguration godoc
// @summary Update user configuration
// @tags Frontend
// @description Updates a user's configuration key-value pair.
// @accept mpfd
// @produce plain
// @param key formData string true "Configuration key"
// @param value formData string true "Configuration value"
// @success 200 {string} string "success"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /configuration/ [post]
func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
key, value := r.FormValue("key"), r.FormValue("value")
@@ -278,9 +355,25 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
}
rw.WriteHeader(http.StatusOK)
- rw.Write([]byte("success"))
+ if _, err := rw.Write([]byte("success")); err != nil {
+ cclog.Errorf("Failed to write response: %v", err)
+ }
}
// putMachineState godoc
// @summary Store machine state
// @tags Machine State
// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @accept json
// @produce plain
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 201 "Created"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [put]
func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
@@ -291,13 +384,12 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
cluster := vars["cluster"]
host := vars["host"]
- // Validate cluster and host to prevent path traversal attacks
- if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
- handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
+ if err := validatePathComponent(cluster, "cluster name"); err != nil {
+ handleError(err, http.StatusBadRequest, rw)
return
}
- if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
- handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
+ if err := validatePathComponent(host, "host name"); err != nil {
+ handleError(err, http.StatusBadRequest, rw)
return
}
@@ -323,6 +415,18 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusCreated)
}
// getMachineState godoc
// @summary Retrieve machine state
// @tags Machine State
// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @produce json
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 200 {object} object "Machine state JSON data"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [get]
func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
@@ -333,13 +437,12 @@ func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
cluster := vars["cluster"]
host := vars["host"]
- // Validate cluster and host to prevent path traversal attacks
- if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
- handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
+ if err := validatePathComponent(cluster, "cluster name"); err != nil {
+ handleError(err, http.StatusBadRequest, rw)
return
}
- if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
- handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
+ if err := validatePathComponent(host, "host name"); err != nil {
+ handleError(err, http.StatusBadRequest, rw)
return
}


@@ -11,8 +11,8 @@ import (
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/gorilla/mux"
)
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
- // @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
+ // @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"


@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).
```go
// In archiver.go ArchiveJob() function
- jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
+ jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
@@ -185,6 +185,6 @@ Internal state is protected by:
## Dependencies
- `internal/repository`: Database operations for job metadata
- - `internal/metricdispatcher`: Loading metric data from various backends
+ - `internal/metricdispatch`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures


@@ -54,8 +54,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)


@@ -10,10 +10,10 @@ import (
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
- "github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
+ "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
// ArchiveJob archives a completed job's metric data to the configured archive backend.
@@ -60,7 +60,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
- jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
+ jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
if err != nil {
cclog.Error("Error wile loading job data for archiving")
return nil, err


@@ -25,9 +25,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
- "github.com/ClusterCockpit/cc-lib/util"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
+ "github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/sessions"
)
@@ -616,9 +616,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
}
// If SplitHostPort fails, IPAddress is already just a host (no port)
- // If nothing declared in config: deny all request to this api endpoint
+ // If nothing declared in config: Continue
if len(config.Keys.APIAllowedIPs) == 0 {
- return fmt.Errorf("missing configuration key ApiAllowedIPs")
+ return nil
}
// If wildcard declared in config: Continue
if config.Keys.APIAllowedIPs[0] == "*" {
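Condensed, the relaxed allow-list semantics read roughly like this sketch (not the literal securedCheck body; it assumes the caller has already stripped any port from the remote address and imports the standard "slices" package): an empty apiAllowedIPs list and a leading "*" both accept any source IP, otherwise the request IP must be listed.

```go
// Sketch of the relaxed IP allow-list check.
func ipAllowed(ip string, allowed []string) bool {
	if len(allowed) == 0 || allowed[0] == "*" {
		return true // no list configured, or wildcard: accept everything
	}
	return slices.Contains(allowed, ip) // otherwise the IP must be listed explicitly
}
```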


@@ -14,8 +14,8 @@ import (
"strings"
"time"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)


@@ -12,8 +12,8 @@ import (
"net/http"
"os"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)


@@ -11,8 +11,8 @@ import (
"fmt"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)


@@ -8,7 +8,7 @@ package auth
import (
"testing"
- "github.com/ClusterCockpit/cc-lib/schema"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -237,7 +237,6 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}


@@ -13,8 +13,8 @@ import (
"os"
"strings"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)


@@ -13,8 +13,8 @@ import (
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-ldap/ldap/v3"
)


@@ -9,8 +9,8 @@ import (
"fmt"
"net/http"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"golang.org/x/crypto/bcrypt"
)


@@ -15,8 +15,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"golang.org/x/oauth2"


@@ -11,8 +11,8 @@ import (
"encoding/json"
"time"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/resampler"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/resampler"
)
type ProgramConfig struct {
@@ -78,9 +78,6 @@ type ProgramConfig struct {
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
- // Global upstream metric repository configuration for metric pull workers
- UpstreamMetricRepository *json.RawMessage `json:"upstreamMetricRepository,omitempty"`
}
type ResampleConfig struct {
@@ -93,8 +90,7 @@ type ResampleConfig struct {
}
type NATSConfig struct {
- SubjectJobStart string `json:"subjectJobStart"`
- SubjectJobStop string `json:"subjectJobStop"`
+ SubjectJobEvent string `json:"subjectJobEvent"`
SubjectNodeState string `json:"subjectNodeState"`
}
@@ -115,13 +111,6 @@ type FilterRanges struct {
StartTime *TimeRange `json:"startTime"`
}
- type ClusterConfig struct {
- Name string `json:"name"`
- FilterRanges *FilterRanges `json:"filterRanges"`
- }
- var Clusters []*ClusterConfig
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
@@ -135,7 +124,7 @@ var Keys ProgramConfig = ProgramConfig{
ShortRunningJobsDuration: 5 * 60,
}
- func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
+ func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
@@ -143,17 +132,6 @@ func Init(mainConfig json.RawMessage) {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
- Validate(clustersSchema, clusterConfig)
- dec = json.NewDecoder(bytes.NewReader(clusterConfig))
- dec.DisallowUnknownFields()
- if err := dec.Decode(&Clusters); err != nil {
- cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
- }
- if len(Clusters) < 1 {
- cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
- }
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}
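The two former job subjects are collapsed into a single SubjectJobEvent field. A small sketch of how a matching JSON fragment decodes; only the field names come from the struct tags above, the subject values and the standalone program are made up for illustration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the reworked NATSConfig shown in the diff above.
type NATSConfig struct {
	SubjectJobEvent  string `json:"subjectJobEvent"`
	SubjectNodeState string `json:"subjectNodeState"`
}

func main() {
	raw := []byte(`{"subjectJobEvent": "cc.events.jobs", "subjectNodeState": "cc.events.nodes"}`)

	var cfg NATSConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.SubjectJobEvent, cfg.SubjectNodeState) // cc.events.jobs cc.events.nodes
}
```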


@@ -8,19 +8,15 @@ package config
import (
"testing"
- ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
+ ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
- if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
- Init(cfg, clustercfg)
- } else {
- cclog.Abort("Cluster configuration must be present")
- }
+ Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
- if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
- Init(cfg, clustercfg)
- } else {
- cclog.Abort("Cluster configuration must be present")
- }
+ Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}


@@ -6,7 +6,7 @@
package config
var configSchema = `
{
"type": "object",
"properties": {
"addr": {
@@ -120,103 +120,20 @@ var configSchema = `
},
"required": ["trigger", "resolutions"]
},
- "upstreamMetricRepository": {
- "description": "Global upstream metric repository configuration for metric pull workers",
- "type": "object",
- "properties": {
- "kind": {
- "type": "string",
- "enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
- },
- "url": {
- "type": "string"
- },
- "token": {
- "type": "string"
- }
- },
- "required": ["kind"]
- }
- },
- "required": ["apiAllowedIPs"]
- }`
- var clustersSchema = `
- {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "description": "The name of the cluster.",
- "type": "string"
- },
- "metricDataRepository": {
- "description": "Type of the metric data repository for this cluster",
- "type": "object",
- "properties": {
- "kind": {
- "type": "string",
- "enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
- },
- "url": {
- "type": "string"
- },
- "token": {
- "type": "string"
- }
- },
- "required": ["kind"]
- },
- "filterRanges": {
- "description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
- "type": "object",
- "properties": {
- "numNodes": {
- "description": "UI slider range for number of nodes",
- "type": "object",
- "properties": {
- "from": {
- "type": "integer"
- },
- "to": {
- "type": "integer"
- }
- },
- "required": ["from", "to"]
- },
- "duration": {
- "description": "UI slider range for duration",
- "type": "object",
- "properties": {
- "from": {
- "type": "integer"
- },
- "to": {
- "type": "integer"
- }
- },
- "required": ["from", "to"]
- },
- "startTime": {
- "description": "UI slider range for start time",
- "type": "object",
- "properties": {
- "from": {
- "type": "string",
- "format": "date-time"
- },
- "to": {
- "type": "null"
- }
- },
- "required": ["from", "to"]
- }
- },
- "required": ["numNodes", "duration", "startTime"]
- }
- },
- "required": ["name", "filterRanges"],
- "minItems": 1
- }
- }`
+ "apiSubjects": {
+ "description": "NATS subjects configuration for subscribing to job and node events.",
+ "type": "object",
+ "properties": {
+ "subjectJobEvent": {
+ "description": "NATS subject for job events (start_job, stop_job)",
+ "type": "string"
+ },
+ "subjectNodeState": {
+ "description": "NATS subject for node state updates",
+ "type": "string"
+ }
+ },
+ "required": ["subjectJobEvent", "subjectNodeState"]
+ }
+ }
+ }`


@@ -8,7 +8,7 @@ package config
import (
"encoding/json"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)

File diff suppressed because it is too large


@@ -10,7 +10,7 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
- "github.com/ClusterCockpit/cc-lib/schema"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
type ClusterMetricWithName struct {
@@ -82,6 +82,7 @@ type JobFilter struct {
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
+ Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}


@@ -4,7 +4,7 @@ import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)


@@ -3,7 +3,7 @@ package graph
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
- // Code generated by github.com/99designs/gqlgen version v0.17.84
+ // Code generated by github.com/99designs/gqlgen version v0.17.85
import (
"context"
@@ -19,11 +19,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
- "github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
+ "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Partitions is the resolver for the partitions field.
@@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
- if err = r.Repo.RemoveTagById(tid); err != nil {
+ if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
@@ -484,7 +484,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return nil, err
}
- data, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
+ data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
cclog.Warn("Error while loading job data")
return nil, err
@@ -512,7 +512,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
return nil, err
}
- data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
+ data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
@@ -537,7 +537,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
return nil, err
}
- data, err := metricdispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
+ data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
@@ -590,10 +590,13 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
+ // Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
+ hasNextPage := false
+ if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
@@ -603,8 +606,8 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
cclog.Warn("Error while querying next jobs")
return nil, err
}
- hasNextPage := len(nextJobs) == 1
+ hasNextPage = len(nextJobs) == 1
+ }
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
@@ -702,7 +705,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
res := []*model.JobStats{}
for _, job := range jobs {
- data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
+ data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
@@ -753,13 +756,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}
- if metrics == nil {
+ defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
- metrics = append(metrics, mc.Name)
+ defaultMetrics = append(defaultMetrics, mc.Name)
}
+ if metrics == nil {
+ metrics = defaultMetrics
+ } else {
+ metrics = slices.DeleteFunc(metrics, func(metric string) bool {
+ return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
+ })
}
- data, err := metricdispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
+ data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
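The metric filtering above uses slices.DeleteFunc from the standard library to drop requested metrics that the cluster does not define. The same pattern in isolation, with made-up metric names:

```go
package main

import (
	"fmt"
	"slices"
)

func main() {
	defined := []string{"flops_any", "mem_bw", "cpu_load"} // metrics the cluster defines (made up)
	requested := []string{"mem_bw", "does_not_exist"}      // metrics a client asked for

	// Remove every requested metric that is not defined for the cluster.
	requested = slices.DeleteFunc(requested, func(m string) bool {
		return !slices.Contains(defined, m)
	})

	fmt.Println(requested) // [mem_bw]
}
```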
@@ -825,7 +834,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
}
}
- data, err := metricdispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
+ data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
return nil, err
@@ -880,7 +889,7 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
- data, err := metricdispatcher.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
+ data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -972,12 +981,10 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// SubCluster returns generated.SubClusterResolver implementation.
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
- type (
- clusterResolver struct{ *Resolver }
- jobResolver struct{ *Resolver }
- metricValueResolver struct{ *Resolver }
- mutationResolver struct{ *Resolver }
- nodeResolver struct{ *Resolver }
- queryResolver struct{ *Resolver }
- subClusterResolver struct{ *Resolver }
- )
+ type clusterResolver struct{ *Resolver }
+ type jobResolver struct{ *Resolver }
+ type metricValueResolver struct{ *Resolver }
+ type mutationResolver struct{ *Resolver }
+ type nodeResolver struct{ *Resolver }
+ type queryResolver struct{ *Resolver }
+ type subClusterResolver struct{ *Resolver }


@@ -13,9 +13,9 @@ import (
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
- "github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -55,7 +55,7 @@ func (r *queryResolver) rooflineHeatmap(
// resolution = max(resolution, mc.Timestep)
// }
- jobdata, err := metricdispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
+ jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
return nil, err
@@ -128,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
- if err := metricdispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
+ if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}


@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
@@ -14,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.


@@ -16,8 +16,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
- ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
+ ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// copyFile copies a file from source path to destination path.
@@ -56,33 +56,8 @@ func setup(t *testing.T) *repository.JobRepository {
"archive": {
"kind": "file",
"path": "./var/job-archive"
- },
- "clusters": [
- {
- "name": "testcluster",
- "filterRanges": {
- "numNodes": { "from": 1, "to": 64 },
- "duration": { "from": 0, "to": 86400 },
- "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
- }
- },
- {
- "name": "fritz",
- "filterRanges": {
- "numNodes": { "from": 1, "to": 944 },
- "duration": { "from": 0, "to": 86400 },
- "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
- }
- },
- {
- "name": "taurus",
- "filterRanges": {
- "numNodes": { "from": 1, "to": 4000 },
- "duration": { "from": 0, "to": 604800 },
- "startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
- }
- }
- ]}`
+ }
+ }`
cclog.Init("info", true)
tmpdir := t.TempDir()
@@ -118,11 +93,7 @@ func setup(t *testing.T) *repository.JobRepository {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
- if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
- config.Init(cfg, clustercfg)
- } else {
- t.Fatal("Cluster configuration must be present")
- }
+ config.Init(cfg)
} else {
t.Fatal("Main configuration must be present")
}


@@ -22,8 +22,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
- cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
- "github.com/ClusterCockpit/cc-lib/schema"
+ cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+ "github.com/ClusterCockpit/cc-lib/v2/schema"
)
const (


@@ -2,12 +2,13 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"math"
- ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
+ ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value


@@ -8,7 +8,7 @@ import (
"fmt"
"testing"
- ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
+ ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.


@@ -1,190 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
// BufferCap is the default buffer capacity.
// buffer.data will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const BufferCap int = DefaultBufferCapacity
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyway it is allowed.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function goes back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}


@@ -1,115 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"fmt"
"time"
)
const (
DefaultMaxWorkers = 10
DefaultBufferCapacity = 512
DefaultGCTriggerInterval = 100
DefaultAvroWorkers = 4
DefaultCheckpointBufferMin = 3
DefaultAvroCheckpointInterval = time.Minute
)
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
RetentionInMemory string `json:"retention-in-memory"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
var Keys MetricStoreConfig
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
var Metrics map[string]MetricConfig
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency.
// use metric.Name to check if the metric already exists.
// if not, add it to the Metrics map.
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig, 0)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
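For orientation, a hedged example of a metric-store configuration block as it would be decoded into MetricStoreConfig follows; all concrete values (intervals, paths, channel and cluster names) are illustrative assumptions, not project defaults.
{
  "num-workers": 4,
  "checkpoints": {
    "file-format": "avro",
    "interval": "12h",
    "directory": "./var/checkpoints",
    "restore": "48h"
  },
  "archive": {
    "interval": "168h",
    "directory": "./var/archive",
    "delete-instead": false
  },
  "retention-in-memory": "48h",
  "subscriptions": [
    { "subscribe-to": "hpc-nats", "cluster-tag": "emmy" }
  ]
}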


@@ -1,95 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
},
"restore": {
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
"type": "string"
}
}
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the archived files should be placed.",
"type": "string"
}
}
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"nats": {
"description": "Configuration for accepting published data through NATS.",
"type": "array",
"items": {
"type": "object",
"properties": {
"address": {
"description": "Address of the NATS server.",
"type": "string"
},
"username": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"password": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"creds-file-path": {
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
"type": "string"
},
"subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
}
}
}
}
}`
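The schema above additionally documents a "nats" block that is not part of the MetricStoreConfig struct shown earlier. A hedged example of such a block follows; the server address, credentials path, and subject name are illustrative assumptions.
{
  "nats": [
    {
      "address": "nats://localhost:4222",
      "creds-file-path": "/etc/clustercockpit/nats.creds",
      "subscriptions": [
        { "subscribe-to": "hpc-nats", "cluster-tag": "emmy" }
      ]
    }
  ]
}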


@@ -1,192 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/util"
)
// Could also be called "node" as this forms a node in a tree structure.
// Called Level because "node" might be confusing here.
// Can be either a leaf or an inner node. In this tree structure, inner nodes can
// also hold data (in `metrics`).
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`?
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
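A minimal, hypothetical sketch (assuming package-internal access) of how the selector example from the comment above resolves to a level:
func exampleFindLevel() {
	root := &Level{}
	// Creates the intermediate levels "emmy" and "host123" on the way down and
	// returns the leaf for "cpu0" with room for four metric buffers.
	cpu0 := root.findLevelOrCreate([]string{"emmy", "host123", "cpu0"}, 4)
	_ = cpu0
}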
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}


@@ -1,429 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package memorystore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package memorystore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
)
type Metric struct {
Name string
Value schema.Float
MetricConfig MetricConfig
}
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
}
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
// dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(*mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Pass the config.MetricStoreKeys
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that only once the ratio of new allocations exceeds the
// previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
wg.Add(4)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// the shutdown() function when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFunc = shutdown
err = ReceiveNats(ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
}
// InitMetrics creates a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
return nil
}
return msInstance
}
func Shutdown() {
// Check if memorystore was initialized
if msInstance == nil {
cclog.Debug("[METRICSTORE]> MemoryStore not initialized, skipping shutdown")
return
}
// Cancel the context to signal all background goroutines to stop
if shutdownFunc != nil {
shutdownFunc()
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
close(LineProtocolMessages)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
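A hedged sketch of the intended lifecycle from a caller's perspective, based on Init and Shutdown above; the surrounding function, the origin of rawConfig, and the imports are assumptions.
func runMetricStore(rawConfig json.RawMessage) {
	var wg sync.WaitGroup
	// Starts the retention, checkpointing, archiving and staging workers.
	memorystore.Init(rawConfig, &wg)
	// ... serve requests, wait for a shutdown signal ...
	// The caller triggers shutdown (see the note in Init); Shutdown cancels the
	// background context and writes a final checkpoint.
	memorystore.Shutdown()
	wg.Wait()
}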
func getName(m *MemoryStore, i int) string {
for key, val := range m.Metrics {
if val.offset == i {
return key
}
}
return ""
}
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
tickInterval := d / 2
if tickInterval <= 0 {
return
}
ticker := time.NewTicker(tickInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := ms.Free(nil, t.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
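A short, hypothetical write call (package-internal) illustrating the selector and the Metric slice; the metric name and selector values are assumptions, and MetricConfig is left empty so that Write looks it up by name.
func exampleWrite(ms *MemoryStore) error {
	return ms.Write([]string{"emmy", "host123"}, time.Now().Unix(), []Metric{
		{Name: "flops_any", Value: schema.Float(42.0)},
	})
}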
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig in each element of `metrics` is filled in.
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
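A hedged read sketch (package-internal); it assumes util.Selector elements expose the String field used by findBuffers above, and the metric name, time range, and resolution are placeholders.
func exampleRead(ms *MemoryStore) ([]schema.Float, error) {
	sel := util.Selector{{String: "emmy"}, {String: "host123"}}
	to := time.Now().Unix()
	from := to - 3600 // last hour
	data, _, _, _, err := ms.Read(sel, "flops_any", from, to, 60)
	return data, err
}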
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// ListChildren, given a selector, returns a list of all children of the
// selected level.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}

File diff suppressed because it is too large


@@ -1,127 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
}
var upstreamMetricDataRepo MetricDataRepository
// func Init() error {
// for _, cluster := range config.Clusters {
// if cluster.MetricDataRepository != nil {
// var kind struct {
// Kind string `json:"kind"`
// }
// if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
// cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
// return err
// }
//
// var mdr MetricDataRepository
// switch kind.Kind {
// case "cc-metric-store":
// mdr = &CCMetricStore{}
// case "prometheus":
// mdr = &PrometheusDataRepository{}
// case "test":
// mdr = &TestMetricDataRepository{}
// default:
// return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
// }
//
// if err := mdr.Init(cluster.MetricDataRepository); err != nil {
// cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
// return err
// }
// metricDataRepos[cluster.Name] = mdr
// }
// }
// return nil
// }
// func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
// var err error
// repo, ok := metricDataRepos[cluster]
//
// if !ok {
// err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
// }
//
// return repo, err
// }
// InitUpstreamRepos initializes global upstream metric data repository for the pull worker
func InitUpstreamRepos() error {
if config.Keys.UpstreamMetricRepository == nil {
return nil
}
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(*config.Keys.UpstreamMetricRepository, &kind); err != nil {
cclog.Warn("Error while unmarshaling raw json UpstreamMetricRepository")
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "prometheus":
mdr = &PrometheusDataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("METRICDATA/METRICDATA > Unknown UpstreamMetricRepository %v", kind.Kind)
}
if err := mdr.Init(*config.Keys.UpstreamMetricRepository); err != nil {
cclog.Errorf("Error initializing UpstreamMetricRepository %v", kind.Kind)
return err
}
upstreamMetricDataRepo = mdr
cclog.Infof("Initialized global upstream metric repository '%s'", kind.Kind)
return nil
}
// GetUpstreamMetricDataRepo returns the global upstream metric data repository
func GetUpstreamMetricDataRepo() (MetricDataRepository, error) {
if upstreamMetricDataRepo == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no upstream metric data repository configured")
}
return upstreamMetricDataRepo, nil
}
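To illustrate the kind switch above, a hedged example of an upstream metric repository configuration follows; besides "kind" ("cc-metric-store", "prometheus", or "test"), the remaining fields are passed unchanged to the selected repository's Init, and the URL is an assumption.
{
  "kind": "prometheus",
  "url": "http://localhost:9090"
}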


@@ -1,588 +0,0 @@
// Copyright (C) 2022 DKRZ
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
promcfg "github.com/prometheus/common/config"
promm "github.com/prometheus/common/model"
)
type PrometheusDataRepositoryConfig struct {
Url string `json:"url"`
Username string `json:"username,omitempty"`
Suffix string `json:"suffix,omitempty"`
Templates map[string]string `json:"query-templates"`
}
type PrometheusDataRepository struct {
client promapi.Client
queryClient promv1.API
suffix string
templates map[string]*template.Template
}
type PromQLArgs struct {
Nodes string
}
type Trie map[rune]Trie
var logOnce sync.Once
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
if len(data) == 0 {
return 0.0, 0.0, 0.0
}
min := math.MaxFloat64
max := -math.MaxFloat64
var sum float64
var n float64
for _, val := range data {
if val.IsNaN() {
continue
}
sum += float64(val)
n += 1
if float64(val) > max {
max = float64(val)
}
if float64(val) < min {
min = float64(val)
}
}
return min, max, sum / n
}
// Rewritten from
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
func nodeRegex(nodes []string) string {
root := Trie{}
// add runes of each compute node to trie
for _, node := range nodes {
_trie := root
for _, c := range node {
if _, ok := _trie[c]; !ok {
_trie[c] = Trie{}
}
_trie = _trie[c]
}
_trie['*'] = Trie{}
}
// recursively build regex from rune trie
var trieRegex func(trie Trie, reset bool) string
trieRegex = func(trie Trie, reset bool) string {
if reset {
trie = root
}
if len(trie) == 0 {
return ""
}
if len(trie) == 1 {
for key, _trie := range trie {
if key == '*' {
return ""
}
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
}
} else {
sequences := []string{}
for key, _trie := range trie {
if key != '*' {
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
}
}
sort.Slice(sequences, func(i, j int) bool {
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
})
var result string
// single edge from this tree node
if len(sequences) == 1 {
result = sequences[0]
if len(result) > 1 {
result = "(?:" + result + ")"
}
// multiple edges, each length 1
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
// char or numeric range
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
// char or numeric set
} else {
result = "[" + s + "]"
}
// multiple edges of different lengths
} else {
result = "(?:" + strings.Join(sequences, "|") + ")"
}
if _, ok := trie['*']; ok {
result += "?"
}
return result
}
return ""
}
return trieRegex(root, true)
}
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var config PrometheusDataRepositoryConfig
// parse config
if err := json.Unmarshal(rawConfig, &config); err != nil {
cclog.Warn("Error while unmarshaling raw json config")
return err
}
// support basic authentication
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
client, err := promapi.NewClient(promapi.Config{
Address: config.Url,
RoundTripper: rt,
})
if err != nil {
cclog.Error("Error while initializing new prometheus client")
return err
}
// init query client
pdb.client = client
pdb.queryClient = promv1.NewAPI(pdb.client)
// site config
pdb.suffix = config.Suffix
// init query templates
pdb.templates = make(map[string]*template.Template)
for metric, templ := range config.Templates {
pdb.templates[metric], err = template.New(metric).Parse(templ)
if err == nil {
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
} else {
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
}
}
return nil
}
// TODO: respect scope argument
func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
} else {
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
}
buf := &bytes.Buffer{}
if templ, ok := pdb.templates[metric]; ok {
err := templ.Execute(buf, args)
if err != nil {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
} else {
query := buf.String()
cclog.Debugf("PromQL: %s", query)
return query, nil
}
} else {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
}
}
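A hedged example of a query-templates entry consumed by FormatQuery; the metric name and PromQL expression are assumptions. For nodes node01..node03 and suffix .example.org, {{.Nodes}} would expand to something like (node0[1-3]).example.org via the trie-based nodeRegex above.
{
  "query-templates": {
    "cpu_load": "node_load1{exported_instance=~\"{{.Nodes}}\"}"
  }
}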
// Convert PromAPI row to CC schema.Series
func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
for _, v := range row.Values {
idx := (v.Timestamp.Unix() - ts) / step
values[idx] = schema.Float(v.Value)
}
min, max, mean := MinMaxMean(values)
// output struct
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
},
}
}
func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
jobData := make(schema.JobData)
// parse job specs
nodes := make([]string, len(job.Resources))
for i, resource := range job.Resources {
nodes[i] = resource.Hostname
}
from := time.Unix(job.StartTime, 0)
to := time.Unix(job.StartTime+int64(job.Duration), 0)
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all job nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
// init data structures
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
jobMetric.Series = append(jobMetric.Series,
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
sort.Slice(jobMetric.Series, func(i, j int) bool {
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
})
}
}
return jobData, nil
}
// TODO change implementation to precomputed/cached stats
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for stats")
return nil, err
}
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = series.Statistics
}
}
return stats, nil
}
func (pdb *PrometheusDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[hostname] = hostdata
}
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},
)
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
// Implemented by NHR@FAU; Used in Job-View StatsTable
func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
// Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Implemented by NHR@FAU; Used in NodeList-View
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
// Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList
// Fetch Data, based on pdb.LoadNodeData()
t0 := time.Now()
// Map of hosts of jobData
data := make(map[string]schema.JobData)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(schema.JobData)
data[hostname] = hostdata
}
metricdata, ok := hostdata[metric]
if !ok {
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
data[hostname][metric] = metricdata
}
// output per host, metric and scope
scopeData, ok := metricdata[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}
data[hostname][metric][scope] = scopeData
}
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
return data, nil
}


@@ -1,74 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
// TestMetricDataRepository is only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
panic("TODO")
}
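A minimal, hypothetical sketch of how the callback above could be overridden in a unit test; the test name, imports, and returned data are placeholders.
func TestWithMockRepository(t *testing.T) {
	metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string,
		scopes []schema.MetricScope, ctx context.Context, resolution int,
	) (schema.JobData, error) {
		// Return empty, but valid, job data instead of panicking.
		return schema.JobData{}, nil
	}
	// ... exercise code paths that end up in TestMetricDataRepository.LoadData ...
}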


@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Package metricdispatcher provides a unified interface for loading and caching job metric data. // Package metricdispatch provides a unified interface for loading and caching job metric data.
// //
// This package serves as a central dispatcher that routes metric data requests to the appropriate // This package serves as a central dispatcher that routes metric data requests to the appropriate
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store). // backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
@@ -29,13 +29,13 @@
// //
// The primary entry point is LoadData, which automatically handles both running and archived jobs: // The primary entry point is LoadData, which automatically handles both running and archived jobs:
// //
// jobData, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, resolution) // jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil { // if err != nil {
// // Handle error // // Handle error
// } // }
// //
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format. // For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
package metricdispatcher package metricdispatch
import ( import (
"context" "context"
@@ -44,12 +44,12 @@ import (
"time" "time"
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricstore"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache" "github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/resampler" "github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data. // cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
@@ -109,7 +109,7 @@ func LoadData(job *schema.Job,
} }
} }
jd, err = memorystore.LoadData(job, metrics, scopes, ctx, resolution) jd, err = metricstore.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil { if err != nil {
if len(jd) != 0 { if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s", cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
@@ -238,7 +238,7 @@ func LoadAverages(
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here? return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
} }
stats, err := memorystore.LoadStats(job, metrics, ctx) stats, err := metricstore.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, err.Error())
@@ -275,7 +275,7 @@ func LoadScopedJobStats(
return archive.LoadScopedStatsFromArchive(job, metrics, scopes) return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
} }
scopedStats, err := memorystore.LoadScopedStats(job, metrics, scopes, ctx) scopedStats, err := metricstore.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, err.Error())
@@ -299,7 +299,7 @@ func LoadJobStats(
data := make(map[string]schema.MetricStatistics, len(metrics)) data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := memorystore.LoadStats(job, metrics, ctx) stats, err := metricstore.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, err.Error())
@@ -348,7 +348,7 @@ func LoadNodeData(
} }
} }
data, err := memorystore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) data, err := metricstore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil { if err != nil {
if len(data) != 0 { if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error()) cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
@@ -385,7 +385,7 @@ func LoadNodeListData(
} }
} }
data, err := memorystore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx) data, err := metricstore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil { if err != nil {
if len(data) != 0 { if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s", cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",


@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package metricdispatcher package metricdispatch
import ( import (
"testing" "testing"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
func TestDeepCopy(t *testing.T) { func TestDeepCopy(t *testing.T) {


@@ -3,22 +3,32 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore // This file contains the API types and data fetching logic for querying metric data
// from the in-memory metric store. It provides structures for building complex queries
// with support for aggregation, scaling, padding, and statistics computation.
package metricstore
import ( import (
"errors" "errors"
"fmt" "fmt"
"math" "math"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/util" "github.com/ClusterCockpit/cc-lib/v2/util"
) )
var ( var (
// ErrInvalidTimeRange is returned when a query has 'from' >= 'to'
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'") ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
// ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified
ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty") ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty")
) )
// APIMetricData represents the response data for a single metric query.
//
// It contains both the time-series data points and computed statistics (avg, min, max).
// If an error occurred during data retrieval, the Error field will be set and other
// fields may be incomplete.
type APIMetricData struct { type APIMetricData struct {
Error *string `json:"error,omitempty"` Error *string `json:"error,omitempty"`
Data schema.FloatArray `json:"data,omitempty"` Data schema.FloatArray `json:"data,omitempty"`
@@ -30,6 +40,13 @@ type APIMetricData struct {
Max schema.Float `json:"max"` Max schema.Float `json:"max"`
} }
// APIQueryRequest represents a batch query request for metric data.
//
// It supports two modes of operation:
// 1. Explicit queries via the Queries field
// 2. Automatic query generation via ForAllNodes (queries all specified metrics for all nodes in the cluster)
//
// The request can be customized with flags to include/exclude statistics, raw data, and padding.
type APIQueryRequest struct { type APIQueryRequest struct {
Cluster string `json:"cluster"` Cluster string `json:"cluster"`
Queries []APIQuery `json:"queries"` Queries []APIQuery `json:"queries"`
@@ -41,11 +58,25 @@ type APIQueryRequest struct {
WithPadding bool `json:"with-padding"` WithPadding bool `json:"with-padding"`
} }
// APIQueryResponse represents the response to an APIQueryRequest.
//
// Results is a 2D array where each outer element corresponds to a query,
// and each inner element corresponds to a selector within that query
// (e.g., multiple CPUs or cores).
type APIQueryResponse struct { type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"` Queries []APIQuery `json:"queries,omitempty"`
Results [][]APIMetricData `json:"results"` Results [][]APIMetricData `json:"results"`
} }
// APIQuery represents a single metric query with optional hierarchical selectors.
//
// The hierarchical selection works as follows:
// - Hostname: The node to query
// - Type + TypeIds: First level of hierarchy (e.g., "cpu" + ["0", "1", "2"])
// - SubType + SubTypeIds: Second level of hierarchy (e.g., "core" + ["0", "1"])
//
// If Aggregate is true, data from multiple type/subtype IDs will be aggregated according
// to the metric's aggregation strategy. Otherwise, separate results are returned for each combination.
type APIQuery struct { type APIQuery struct {
Type *string `json:"type,omitempty"` Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"` SubType *string `json:"subtype,omitempty"`
@@ -58,6 +89,11 @@ type APIQuery struct {
Aggregate bool `json:"aggreg"` Aggregate bool `json:"aggreg"`
} }
// AddStats computes and populates the Avg, Min, and Max fields from the Data array.
//
// NaN values in the data are ignored during computation. If all values are NaN,
// the statistics fields will be set to NaN.
//
// TODO: Optimize this, just like the stats endpoint! // TODO: Optimize this, just like the stats endpoint!
func (data *APIMetricData) AddStats() { func (data *APIMetricData) AddStats() {
n := 0 n := 0
@@ -83,6 +119,10 @@ func (data *APIMetricData) AddStats() {
} }
} }
// ScaleBy multiplies all data points and statistics by the given factor.
//
// This is commonly used for unit conversion (e.g., bytes to gigabytes).
// Scaling by 0 or 1 is a no-op for performance reasons.
func (data *APIMetricData) ScaleBy(f schema.Float) { func (data *APIMetricData) ScaleBy(f schema.Float) {
if f == 0 || f == 1 { if f == 0 || f == 1 {
return return
@@ -96,6 +136,17 @@ func (data *APIMetricData) ScaleBy(f schema.Float) {
} }
} }
// PadDataWithNull pads the beginning of the data array with NaN values if needed.
//
// This ensures that the data aligns with the requested 'from' timestamp, even if
// the metric store doesn't have data for the earliest time points. This is useful
// for maintaining consistent array indexing across multiple queries.
//
// Parameters:
// - ms: MemoryStore instance to lookup metric configuration
// - from: The requested start timestamp
// - to: The requested end timestamp (unused but kept for API consistency)
// - metric: The metric name to lookup frequency information
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) { func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
minfo, ok := ms.Metrics[metric] minfo, ok := ms.Metrics[metric]
if !ok { if !ok {
@@ -115,6 +166,31 @@ func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metr
} }
} }
// FetchData executes a batch metric query request and returns the results.
//
// This is the primary API for retrieving metric data from the memory store. It supports:
// - Individual queries via req.Queries
// - Batch queries for all nodes via req.ForAllNodes
// - Hierarchical selector construction (cluster → host → type → subtype)
// - Optional statistics computation (avg, min, max)
// - Optional data scaling
// - Optional data padding with NaN values
//
// The function constructs selectors based on the query parameters and calls MemoryStore.Read()
// for each selector. If a query specifies Aggregate=false with multiple type/subtype IDs,
// separate results are returned for each combination.
//
// Parameters:
// - req: The query request containing queries, time range, and options
//
// Returns:
// - APIQueryResponse containing results for each query, or error if validation fails
//
// Errors:
// - ErrInvalidTimeRange if req.From > req.To
// - ErrEmptyCluster if req.ForAllNodes is used without specifying a cluster
// - Error if MemoryStore is not initialized
// - Individual query errors are stored in APIMetricData.Error field
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) { func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
if req.From > req.To { if req.From > req.To {
return nil, ErrInvalidTimeRange return nil, ErrInvalidTimeRange
@@ -126,10 +202,9 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
req.WithData = true req.WithData = true
ms := GetMemoryStore() ms := GetMemoryStore()
if ms == nil { if ms == nil {
return nil, fmt.Errorf("memorystore not initialized") return nil, fmt.Errorf("[METRICSTORE]> memorystore not initialized")
} }
response := APIQueryResponse{ response := APIQueryResponse{
Results: make([][]APIMetricData, 0, len(req.Queries)), Results: make([][]APIMetricData, 0, len(req.Queries)),
} }
@@ -196,8 +271,6 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
} }
} }
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
var err error var err error
res := make([]APIMetricData, 0, len(sels)) res := make([]APIMetricData, 0, len(sels))
for _, sel := range sels { for _, sel := range sels {
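Putting the documented query structure together, a hedged example request body for FetchData follows; field names not visible in the snippets above (for example "metric", "host", "type-ids", "from", "to", "with-stats", "with-data") and all concrete values are assumptions.
{
  "cluster": "emmy",
  "from": 1700000000,
  "to": 1700003600,
  "with-stats": true,
  "with-data": true,
  "queries": [
    {
      "metric": "flops_any",
      "host": "host123",
      "type": "cpu",
      "type-ids": ["0", "1"],
      "aggreg": true
    }
  ]
}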


@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"archive/zip" "archive/zip"
@@ -18,15 +18,39 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
) )
// Worker for either Archiving or Deleting files
func Archiving(wg *sync.WaitGroup, ctx context.Context) { func Archiving(wg *sync.WaitGroup, ctx context.Context) {
if Keys.Archive != nil {
// Run as Archiver
runWorker(wg, ctx,
Keys.Archive.ArchiveInterval,
"archiving",
Keys.Archive.RootDir,
Keys.Archive.DeleteInstead,
)
} else {
// Run as Deleter
runWorker(wg, ctx,
Keys.RetentionInMemory,
"deleting",
"",
true,
)
}
}
// runWorker takes simple values to configure what it does
func runWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, archiveDir string, delete bool) {
go func() { go func() {
defer wg.Done() defer wg.Done()
d, err := time.ParseDuration(Keys.Archive.Interval)
d, err := time.ParseDuration(interval)
if err != nil { if err != nil {
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) cclog.Fatalf("[METRICSTORE]> error parsing %s interval duration: %v\n", mode, err)
} }
if d <= 0 { if d <= 0 {
return return
@@ -41,17 +65,21 @@ func Archiving(wg *sync.WaitGroup, ctx context.Context) {
return return
case <-ticker.C: case <-ticker.C:
t := time.Now().Add(-d) t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339)) cclog.Infof("[METRICSTORE]> start %s checkpoints (older than %s)...", mode, t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead) n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir, archiveDir, t.Unix(), delete)
if err != nil { if err != nil {
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error()) cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error())
} else {
if delete && archiveDir == "" {
cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n)
} else { } else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n) cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
} }
} }
} }
}
}() }()
} }


@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"bufio" "bufio"
@@ -19,13 +19,15 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/linkedin/goavro/v2" "github.com/linkedin/goavro/v2"
) )
var NumAvroWorkers int = DefaultAvroWorkers var (
var startUp bool = true NumAvroWorkers int = DefaultAvroWorkers
startUp bool = true
)
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
levels := make([]*AvroLevel, 0) levels := make([]*AvroLevel, 0)

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"context" "context"
@@ -11,7 +11,7 @@ import (
"strconv" "strconv"
"sync" "sync"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
) )
func DataStaging(wg *sync.WaitGroup, ctx context.Context) { func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
@@ -30,8 +30,51 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
// Drain any remaining messages in channel before exiting
for {
select {
case val, ok := <-LineProtocolMessages:
if !ok {
// Channel closed
return return
case val := <-LineProtocolMessages: }
// Process remaining message
freq, err := GetMetricFrequency(val.MetricName)
if err != nil {
continue
}
metricName := ""
for _, selectorName := range val.Selector {
metricName += selectorName + SelectorDelimiter
}
metricName += val.MetricName
var selector []string
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
if !stringSlicesEqual(oldSelector, selector) {
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
if avroLevel == nil {
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
}
oldSelector = slices.Clone(selector)
}
if avroLevel != nil {
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
default:
// No more messages, exit
return
}
}
case val, ok := <-LineProtocolMessages:
if !ok {
// Channel closed, exit gracefully
return
}
// Fetch the frequency of the metric from the global configuration // Fetch the frequency of the metric from the global configuration
freq, err := GetMetricFrequency(val.MetricName) freq, err := GetMetricFrequency(val.MetricName)
if err != nil { if err != nil {
@@ -65,9 +108,11 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
oldSelector = slices.Clone(selector) oldSelector = slices.Clone(selector)
} }
if avroLevel != nil {
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq)) avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
} }
} }
}
}() }()
} }
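The shutdown path added above drains whatever is still buffered in LineProtocolMessages before the goroutine exits. A minimal, self-contained sketch of that drain-on-cancel pattern, using a generic int channel and a placeholder process function rather than the actual DataStaging types:

package main

import (
	"context"
	"fmt"
)

// drainOnCancel processes items until ctx is cancelled, then empties whatever
// is still buffered in ch before returning, so queued items are not lost.
func drainOnCancel(ctx context.Context, ch <-chan int, process func(int)) {
	for {
		select {
		case <-ctx.Done():
			for {
				select {
				case v, ok := <-ch:
					if !ok {
						return // channel closed
					}
					process(v)
				default:
					return // buffer drained
				}
			}
		case v, ok := <-ch:
			if !ok {
				return
			}
			process(v)
		}
	}
}

func main() {
	ch := make(chan int, 4)
	ch <- 1
	ch <- 2
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	drainOnCancel(ctx, ch, func(v int) { fmt.Println("processed", v) })
}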

View File

@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"sync" "sync"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
var ( var (

View File

@@ -0,0 +1,322 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides buffer.go: Time-series data buffer implementation.
//
// # Buffer Architecture
//
// Each metric at each hierarchical level (cluster/host/cpu/etc.) uses a linked-list
// chain of fixed-size buffers to store time-series data. This design:
//
// - Avoids reallocation/copying when growing (new links added instead)
// - Enables efficient pooling (buffers returned to sync.Pool)
// - Supports traversal back in time (via prev pointers)
// - Maintains temporal ordering (newer data in later buffers)
//
// # Buffer Chain Example
//
// [oldest buffer] <- prev -- [older] <- prev -- [newest buffer (head)]
// start=1000 start=1512 start=2024
// data=[v0...v511] data=[v0...v511] data=[v0...v42]
//
// When the head buffer reaches capacity (BufferCap = 512), a new buffer becomes
// the new head and the old head is linked via prev.
//
// # Pooling Strategy
//
// sync.Pool reduces GC pressure for the common case (BufferCap-sized allocations).
// Non-standard capacity buffers are not pooled (e.g., from checkpoint deserialization).
//
// # Time Alignment
//
// Timestamps are aligned to measurement frequency intervals:
//
// index = (timestamp - buffer.start) / buffer.frequency
// actualTime = buffer.start + (frequency / 2) + (index * frequency)
//
// Missing data points are represented as NaN values. The read() function performs
// linear interpolation where possible.
package metricstore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// BufferCap is the default buffer capacity.
// buffer.data will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const BufferCap int = DefaultBufferCapacity
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
// ErrNoData indicates no time-series data exists for the requested metric/level.
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
// ErrDataDoesNotAlign indicates that aggregated data from child scopes
// does not align with the parent scope's expected timestamps/intervals.
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// buffer stores time-series data for a single metric at a specific hierarchical level.
//
// Buffers form doubly-linked chains ordered by time. When capacity is reached,
// a new buffer becomes the head and the old head is linked via prev/next.
//
// Fields:
// - prev: Link to older buffer in the chain (nil if this is oldest)
// - next: Link to newer buffer in the chain (nil if this is newest/head)
// - data: Time-series values (schema.Float supports NaN for missing data)
// - frequency: Measurement interval in seconds
// - start: Start timestamp (adjusted by -frequency/2 for alignment)
// - archived: True if data has been persisted to disk archive
// - closed: True if buffer is no longer accepting writes
//
// Index calculation: index = (timestamp - start) / frequency
// Actual data timestamp: start + (frequency / 2) + (index * frequency)
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// write appends a timestamped value to the buffer chain.
//
// Returns the head buffer (which may be newly created if capacity was reached).
// Timestamps older than the buffer's start are rejected. If the calculated index
// exceeds capacity, a new buffer is allocated and linked as the new head.
//
// Missing timestamps are automatically filled with NaN values to maintain alignment.
// Overwrites are allowed if the index is already within the existing data slice.
//
// Parameters:
// - ts: Unix timestamp in seconds
// - value: Metric value (can be schema.NaN for missing data)
//
// Returns:
// - *buffer: The new head buffer (same as b if no new buffer created)
// - error: Non-nil if timestamp is before buffer start
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
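For illustration, the index and alignment arithmetic documented above, worked through with invented numbers (frequency 60 s, buffer created at ts=1000, write at ts=1120):

package main

import "fmt"

func main() {
	const freq = int64(60)
	start := int64(1000) - freq/2       // buffer start, as in newBuffer(): 970
	ts := int64(1120)
	idx := (ts - start) / freq          // slot index: (1120-970)/60 = 2
	actual := start + freq/2 + idx*freq // aligned timestamp: 970+30+120 = 1120
	fmt.Println(idx, actual)            // slots 0 and 1 would be NaN-filled
}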
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
// read retrieves time-series data from the buffer chain for the specified time range.
//
// Traverses the buffer chain backwards (via prev links) if 'from' precedes the current
// buffer's start. Missing data points are represented as NaN. Values are accumulated
// into the provided 'data' slice (using +=, so caller must zero-initialize if needed).
//
// The function adjusts the actual time range returned if data is unavailable at the
// boundaries (returned via adjusted from/to timestamps).
//
// Parameters:
// - from: Start timestamp (Unix seconds)
// - to: End timestamp (Unix seconds, exclusive)
// - data: Pre-allocated slice to accumulate results (must be large enough)
//
// Returns:
// - []schema.Float: Slice of data (may be shorter than input 'data' slice)
// - int64: Actual start timestamp with available data
// - int64: Actual end timestamp (exclusive)
// - error: Non-nil on failure
//
// Panics if 'data' slice is too small to hold all values in [from, to).
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
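Because read() accumulates with +=, callers must pass a zero-valued slice sized for the requested window. A standalone illustration of that contract using plain float64 in place of schema.Float (all values invented):

package main

import (
	"fmt"
	"math"
)

func main() {
	from, to, freq := int64(0), int64(300), int64(60)

	// Zero-initialized slice: required because read() accumulates with +=.
	data := make([]float64, (to-from)/freq)

	samples := []float64{1, 2, math.NaN(), 4, 5}
	for i, v := range samples {
		data[i] += v // NaN marks a missing point; present values accumulate
	}
	fmt.Println(data) // [1 2 NaN 4 5]
}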
// free removes buffers older than the specified timestamp from the chain.
//
// Recursively traverses backwards (via prev) and unlinks buffers whose end time
// is before the retention threshold. Freed buffers are returned to the pool if
// they have the standard capacity (BufferCap).
//
// Parameters:
// - t: Retention threshold timestamp (Unix seconds)
//
// Returns:
// - delme: True if the current buffer itself should be deleted by caller
// - n: Number of buffers freed in this subtree
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// forceFreeOldest recursively finds the end of the linked list (the oldest buffer)
// and removes it.
// Returns:
//
// delme: true if 'b' itself is the oldest and should be removed by the caller
// n: the number of buffers freed (will be 1 or 0)
func (b *buffer) forceFreeOldest() (delme bool, n int) {
// If there is a previous buffer, recurse down to find the oldest
if b.prev != nil {
delPrev, freed := b.prev.forceFreeOldest()
// If the previous buffer signals it should be deleted:
if delPrev {
// Unlink references
b.prev.next = nil
// Return to pool if capacity matches
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
// Remove the link from the current buffer
b.prev = nil
}
return false, freed
}
// If b.prev is nil, THIS buffer is the oldest.
// We return true so the parent (or the Level loop) knows to delete reference to 'b'.
return true, 1
}
// iterFromTo invokes callback on every buffer in the chain that overlaps [from, to].
//
// Traverses backwards (via prev) first, then processes current buffer if it overlaps
// the time range. Used for checkpoint/archive operations that need to serialize buffers
// within a specific time window.
//
// Parameters:
// - from: Start timestamp (Unix seconds, inclusive)
// - to: End timestamp (Unix seconds, inclusive)
// - callback: Function to invoke on each overlapping buffer
//
// Returns:
// - error: First error returned by callback, or nil if all succeeded
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -3,7 +3,35 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore // This file implements checkpoint persistence for the in-memory metric store.
//
// Checkpoints enable graceful restarts by periodically saving in-memory metric
// data to disk in either JSON or Avro format. The checkpoint system:
//
// Key Features:
// - Periodic background checkpointing via the Checkpointing() worker
// - Two formats: JSON (human-readable) and Avro (compact, efficient)
// - Parallel checkpoint creation and loading using worker pools
// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|avro}
// - Only saves unarchived data (archived data is already persisted elsewhere)
// - Automatic format detection and fallback during loading
// - GC optimization during loading to prevent excessive heap growth
//
// Checkpoint Workflow:
// 1. Init() loads checkpoints within retention window at startup
// 2. Checkpointing() worker periodically saves new data
// 3. Shutdown() writes final checkpoint before exit
//
// File Organization:
//
// checkpoints/
// cluster1/
// host001/
// 1234567890.json (timestamp = checkpoint start time)
// 1234567950.json
// host002/
// ...
package metricstore
import ( import (
"bufio" "bufio"
@@ -23,24 +51,27 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/linkedin/goavro/v2" "github.com/linkedin/goavro/v2"
) )
const ( const (
CheckpointFilePerms = 0o644 CheckpointFilePerms = 0o644 // File permissions for checkpoint files
CheckpointDirPerms = 0o755 CheckpointDirPerms = 0o755 // Directory permissions for checkpoint directories
GCTriggerInterval = DefaultGCTriggerInterval GCTriggerInterval = DefaultGCTriggerInterval // Interval for triggering GC during checkpoint loading
) )
// Whenever changed, update MarshalJSON as well! // CheckpointMetrics represents metric data in a checkpoint file.
// Whenever the structure changes, update MarshalJSON as well!
type CheckpointMetrics struct { type CheckpointMetrics struct {
Data []schema.Float `json:"data"` Data []schema.Float `json:"data"`
Frequency int64 `json:"frequency"` Frequency int64 `json:"frequency"`
Start int64 `json:"start"` Start int64 `json:"start"`
} }
// CheckpointFile represents the hierarchical structure of a checkpoint file.
// It mirrors the Level tree structure from the MemoryStore.
type CheckpointFile struct { type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"` Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"` Children map[string]*CheckpointFile `json:"children"`
@@ -48,10 +79,23 @@ type CheckpointFile struct {
To int64 `json:"to"` To int64 `json:"to"`
} }
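For orientation, a heavily abbreviated, hypothetical per-host checkpoint file shaped like the structs above; only the JSON keys visible in this hunk are shown, and all timestamps and values are invented:

{
    "metrics": {
        "cpu_load": { "frequency": 60, "start": 1234567890, "data": [0.5, 0.7, 1.1] }
    },
    "children": {
        "cpu0": {
            "metrics": { "cpu_load": { "frequency": 60, "start": 1234567890, "data": [1.0, 0.9] } },
            "children": {},
            "to": 1234567950
        }
    },
    "to": 1234567950
}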
var lastCheckpoint time.Time // lastCheckpoint tracks the timestamp of the last checkpoint creation.
var (
lastCheckpoint time.Time
lastCheckpointMu sync.Mutex
)
// Checkpointing starts a background worker that periodically saves metric data to disk.
//
// The behavior depends on the configured file format:
// - JSON: Periodic checkpointing based on Keys.Checkpoints.Interval
// - Avro: Initial delay + periodic checkpointing at DefaultAvroCheckpointInterval
//
// The worker respects context cancellation and signals completion via the WaitGroup.
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpointMu.Lock()
lastCheckpoint = time.Now() lastCheckpoint = time.Now()
lastCheckpointMu.Unlock()
if Keys.Checkpoints.FileFormat == "json" { if Keys.Checkpoints.FileFormat == "json" {
ms := GetMemoryStore() ms := GetMemoryStore()
@@ -60,9 +104,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
defer wg.Done() defer wg.Done()
d, err := time.ParseDuration(Keys.Checkpoints.Interval) d, err := time.ParseDuration(Keys.Checkpoints.Interval)
if err != nil { if err != nil {
cclog.Fatal(err) cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error())
} }
if d <= 0 { if d <= 0 {
cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d)
return return
} }
@@ -74,15 +119,21 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
case <-ctx.Done(): case <-ctx.Done():
return return
case <-ticker.C: case <-ticker.C:
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339)) lastCheckpointMu.Lock()
from := lastCheckpoint
lastCheckpointMu.Unlock()
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
now := time.Now() now := time.Now()
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix()) from.Unix(), now.Unix())
if err != nil { if err != nil {
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
} else { } else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n) cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
lastCheckpointMu.Lock()
lastCheckpoint = now lastCheckpoint = now
lastCheckpointMu.Unlock()
} }
} }
} }
@@ -113,9 +164,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
} }
} }
// As `Float` implements a custom MarshalJSON() function, // MarshalJSON provides optimized JSON encoding for CheckpointMetrics.
// serializing an array of such types has more overhead //
// than one would assume (because of extra allocations, interfaces and so on). // Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead.
// This method manually constructs JSON to avoid allocations and interface conversions.
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8) buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...) buf = append(buf, `{"frequency":`...)
@@ -137,13 +189,27 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
return buf, nil return buf, nil
} }
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)! // ToCheckpoint writes metric data to checkpoint files in parallel.
// On a per-host basis a new JSON file is created. I have no idea if this will scale. //
// The good thing: Only a host at a time is locked, so this function can run // Metrics at root and cluster levels are skipped. One file per host is created.
// in parallel to writes/reads. // Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host
// at a time, allowing concurrent writes/reads to other hosts.
//
// Returns the number of checkpoint files created and any errors encountered.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) { func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*Level, 0) // Pre-calculate capacity by counting cluster/host pairs
selectors := make([][]string, 0) m.root.lock.RLock()
totalHosts := 0
for _, l1 := range m.root.children {
l1.lock.RLock()
totalHosts += len(l1.children)
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
levels := make([]*Level, 0, totalHosts)
selectors := make([][]string, 0, totalHosts)
m.root.lock.RLock() m.root.lock.RLock()
for sel1, l1 := range m.root.children { for sel1, l1 := range m.root.children {
l1.lock.RLock() l1.lock.RLock()
@@ -203,6 +269,8 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
return int(n), nil return int(n), nil
} }
// toCheckpointFile recursively converts a Level tree to CheckpointFile structure.
// Skips metrics that are already archived. Returns nil if no unarchived data exists.
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) { func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock() l.lock.RLock()
defer l.lock.RUnlock() defer l.lock.RUnlock()
@@ -224,6 +292,7 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
b.iterFromTo(from, to, func(b *buffer) error { b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived { if !b.archived {
allArchived = false allArchived = false
return fmt.Errorf("stop") // Early termination signal
} }
return nil return nil
}) })
@@ -267,6 +336,8 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
return retval, nil return retval, nil
} }
// toCheckpoint writes a Level's data to a JSON checkpoint file.
// Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save.
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m) cf, err := l.toCheckpointFile(from, to, m)
if err != nil { if err != nil {
@@ -278,11 +349,11 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
} }
filepath := path.Join(dir, fmt.Sprintf("%d.json", from)) filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) { if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms) err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil { if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
} }
} }
if err != nil { if err != nil {
@@ -298,9 +369,54 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
return bw.Flush() return bw.Flush()
} }
// enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs.
// Returns error if directory structure is invalid.
func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
clustersDir, err := os.ReadDir(dir)
if err != nil {
return err
}
gcCounter := 0
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
}
hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if err != nil {
return err
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
}
gcCounter++
if gcCounter%GCTriggerInterval == 0 {
// Forcing garbage collection runs here regularly during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will double
// almost always. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
return nil
}
// FromCheckpoint loads checkpoint files from disk into memory in parallel.
//
// Uses worker pool to load cluster/host combinations. Periodically triggers GC
// to prevent excessive heap growth. Returns number of files loaded and any errors.
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) { func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
var wg sync.WaitGroup var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers) work := make(chan [2]string, Keys.NumWorkers*4)
n, errs := int32(0), int32(0) n, errs := int32(0), int32(0)
wg.Add(Keys.NumWorkers) wg.Add(Keys.NumWorkers)
@@ -319,40 +435,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
}() }()
} }
i := 0 err := enqueueCheckpointHosts(dir, work)
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
// Forcing garbage collection runs here regulary during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will double
// almost always. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
close(work) close(work)
wg.Wait() wg.Wait()
@@ -366,9 +449,11 @@ done:
return int(n), nil return int(n), nil
} }
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)! // FromCheckpointFiles is the main entry point for loading checkpoints at startup.
// This function can only be called once and before the very first write or read. //
// Different host's data is loaded to memory in parallel. // Automatically detects checkpoint format (JSON vs Avro) and falls back if needed.
// Creates checkpoint directory if it doesn't exist. This function must be called
// before any writes or reads, and can only be called once.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) { if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, so create it using os.MkdirAll() // The directory does not exist, so create it using os.MkdirAll()
@@ -408,10 +493,10 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
return m.FromCheckpoint(dir, from, altFormat) return m.FromCheckpoint(dir, from, altFormat)
} }
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
return 0, nil return 0, nil
} }
// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
func checkFilesWithExtension(dir string, extension string) (bool, error) { func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false found := false
@@ -731,13 +816,24 @@ func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRece
return nums[a.Name()] < nums[b.Name()] return nums[a.Name()] < nums[b.Name()]
}) })
if len(nums) == 0 {
return nil, nil
}
filenames := make([]string, 0) filenames := make([]string, 0)
for i := range direntries {
e := direntries[i] for i, e := range direntries {
ts1 := nums[e.Name()] ts1 := nums[e.Name()]
// Select files in either the forward or the backward direction:
// if findMoreRecentFiles is set, all files at or after the given
// timestamp are selected; otherwise all files at or before the
// given timestamp are selected.
if findMoreRecentFiles && t <= ts1 { if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name()) filenames = append(filenames, e.Name())
} else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 {
filenames = append(filenames, e.Name())
} }
if i == len(direntries)-1 { if i == len(direntries)-1 {
continue continue
@@ -750,10 +846,6 @@ func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRece
if ts1 < t && t < ts2 { if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name()) filenames = append(filenames, e.Name())
} }
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
} }
} }

View File

@@ -0,0 +1,257 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides config.go: Configuration structures and metric management.
//
// # Configuration Hierarchy
//
// The metricstore package uses nested configuration structures:
//
// MetricStoreConfig (Keys)
// ├─ NumWorkers: Parallel checkpoint/archive workers
// ├─ RetentionInMemory: How long to keep data in RAM
// ├─ MemoryCap: Memory limit in bytes (triggers forceFree)
// ├─ Checkpoints: Persistence configuration
// │ ├─ FileFormat: "avro" or "json"
// │ ├─ Interval: How often to save (e.g., "1h")
// │ └─ RootDir: Checkpoint storage path
// ├─ Archive: Long-term storage configuration
// │ ├─ ArchiveInterval: How often to archive
// │ ├─ RootDir: Archive storage path
// │ └─ DeleteInstead: Delete old data instead of archiving
// ├─ Debug: Development/debugging options
// └─ Subscriptions: NATS topic subscriptions for metric ingestion
//
// # Metric Configuration
//
// Each metric (e.g., "cpu_load", "mem_used") has a MetricConfig entry in the global
// Metrics map, defining:
//
// - Frequency: Measurement interval in seconds
// - Aggregation: How to combine values (sum/avg/none) when transforming scopes
// - offset: Internal index into Level.metrics slice (assigned during Init)
//
// # AggregationStrategy
//
// Determines how to combine metric values when aggregating from finer to coarser scopes:
//
// - NoAggregation: Do not combine (incompatible scopes)
// - SumAggregation: Add values (e.g., power consumption: core→socket)
// - AvgAggregation: Average values (e.g., temperature: core→socket)
package metricstore
import (
"fmt"
"time"
)
const (
DefaultMaxWorkers = 10
DefaultBufferCapacity = 512
DefaultGCTriggerInterval = 100
DefaultAvroWorkers = 4
DefaultCheckpointBufferMin = 3
DefaultAvroCheckpointInterval = time.Minute
)
// Checkpoints configures periodic persistence of in-memory metric data.
//
// Fields:
// - FileFormat: "avro" (default, binary, compact) or "json" (human-readable, slower)
// - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves
// - RootDir: Filesystem path for checkpoint files (created if missing)
type Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
}
// Debug provides development and profiling options.
//
// Fields:
// - DumpToFile: Path to dump checkpoint data for inspection (empty = disabled)
// - EnableGops: Enable gops agent for live runtime debugging (https://github.com/google/gops)
type Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
}
// Archive configures long-term storage of old metric data.
//
// Data older than RetentionInMemory is archived to disk or deleted.
//
// Fields:
// - ArchiveInterval: Duration string (e.g., "24h") between archive operations
// - RootDir: Filesystem path for archived data (created if missing)
// - DeleteInstead: If true, delete old data instead of archiving (saves disk space)
type Archive struct {
ArchiveInterval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
}
// Subscriptions defines NATS topics to subscribe to for metric ingestion.
//
// Each subscription receives metrics via NATS messaging, enabling real-time
// data collection from compute nodes.
//
// Fields:
// - SubscribeTo: NATS subject/channel name (e.g., "metrics.compute.*")
// - ClusterTag: Default cluster name for metrics without cluster tag (optional)
type Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
}
// MetricStoreConfig defines the main configuration for the metricstore.
//
// Loaded from cc-backend's config.json "metricstore" section. Controls memory usage,
// persistence, archiving, and metric ingestion.
//
// Fields:
// - NumWorkers: Parallel workers for checkpoint/archive (0 = auto: min(NumCPU/2+1, 10))
// - RetentionInMemory: Duration string (e.g., "48h") for in-memory data retention
// - MemoryCap: Max bytes for buffer data (0 = unlimited); triggers forceFree when exceeded
// - Checkpoints: Periodic persistence configuration
// - Debug: Development/profiling options (nil = disabled)
// - Archive: Long-term storage configuration (nil = disabled)
// - Subscriptions: NATS topics for metric ingestion (nil = polling only)
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
RetentionInMemory string `json:"retention-in-memory"`
MemoryCap int `json:"memory-cap"`
Checkpoints Checkpoints `json:"checkpoints"`
Debug *Debug `json:"debug"`
Archive *Archive `json:"archive"`
Subscriptions *Subscriptions `json:"nats-subscriptions"`
}
// Keys is the global metricstore configuration instance.
//
// Initialized with defaults, then overwritten by cc-backend's config.json.
// Accessed by Init(), Checkpointing(), and other lifecycle functions.
var Keys MetricStoreConfig = MetricStoreConfig{
Checkpoints: Checkpoints{
FileFormat: "avro",
RootDir: "./var/checkpoints",
},
}
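Purely as an illustration, a "metricstore" section of cc-backend's config.json overriding these defaults might look as follows; all values are invented examples, not recommendations:

"metricstore": {
    "num-workers": 4,
    "retention-in-memory": "48h",
    "memory-cap": 16,
    "checkpoints": {
        "file-format": "avro",
        "interval": "1h",
        "directory": "./var/checkpoints"
    },
    "archive": {
        "interval": "24h",
        "directory": "./var/archive",
        "delete-instead": false
    },
    "nats-subscriptions": [
        { "subscribe-to": "hpc-nats", "cluster-tag": "emmy" }
    ]
}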
// AggregationStrategy defines how to combine metric values across hierarchy levels.
//
// Used when transforming data from finer-grained scopes (e.g., core) to coarser scopes
// (e.g., socket). This is SPATIAL aggregation, not TEMPORAL (time-based) aggregation.
//
// Values:
// - NoAggregation: Do not aggregate (incompatible scopes or non-aggregatable metrics)
// - SumAggregation: Add values (e.g., power: sum core power → socket power)
// - AvgAggregation: Average values (e.g., temperature: average core temps → socket temp)
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota // Do not aggregate
SumAggregation // Sum values (e.g., power, energy)
AvgAggregation // Average values (e.g., temperature, utilization)
)
// AssignAggregationStrategy parses a string into an AggregationStrategy value.
//
// Used when loading metric configurations from JSON/YAML files.
//
// Parameters:
// - str: "sum", "avg", or "" (empty string for NoAggregation)
//
// Returns:
// - AggregationStrategy: Parsed value
// - error: Non-nil if str is unrecognized
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
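A small sketch of what the two strategies mean spatially, with invented per-core readings: SumAggregation would report the socket value as the sum, AvgAggregation as the mean.

package main

import "fmt"

func main() {
	// Hypothetical per-core power readings for one socket.
	corePower := []float64{12.5, 11.0, 13.2, 12.3}

	sum := 0.0
	for _, v := range corePower {
		sum += v
	}
	avg := sum / float64(len(corePower))

	// SumAggregation: socket power = sum of core power.
	// AvgAggregation: socket value = average (e.g., temperature-like metrics).
	fmt.Printf("sum: %.1f, avg: %.2f\n", sum, avg)
}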
// MetricConfig defines configuration for a single metric type.
//
// Stored in the global Metrics map, keyed by metric name (e.g., "cpu_load").
//
// Fields:
// - Frequency: Measurement interval in seconds (e.g., 60 for 1-minute granularity)
// - Aggregation: How to combine values across hierarchy levels (sum/avg/none)
// - offset: Internal index into Level.metrics slice (assigned during Init)
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
// Metrics is the global map of metric configurations.
//
// Keyed by metric name (e.g., "cpu_load", "mem_used"). Populated during Init()
// from cluster configuration and checkpoint restoration. Each MetricConfig.offset
// corresponds to the buffer slice index in Level.metrics.
var Metrics map[string]MetricConfig
// GetMetricFrequency retrieves the measurement interval for a metric.
//
// Parameters:
// - metricName: Metric name (e.g., "cpu_load")
//
// Returns:
// - int64: Frequency in seconds
// - error: Non-nil if metric not found in Metrics map
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric registers a new metric or updates an existing one.
//
// If the metric is already registered with a different frequency, the larger
// Frequency value (i.e., the longer measurement interval) is kept. This handles
// cases where different clusters report the same metric at different intervals.
//
// Parameters:
// - name: Metric name (e.g., "cpu_load")
// - metric: Configuration (frequency, aggregation strategy)
//
// Returns:
// - error: Always nil (signature for future error handling)
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig, 0)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
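A short usage sketch of the reconciliation behaviour implemented above, as it might appear in a package-internal test (metric name and timesteps invented):

// Inside the metricstore package, e.g. in a test:
_ = AddMetric("cpu_load", MetricConfig{Frequency: 60, Aggregation: AvgAggregation})
_ = AddMetric("cpu_load", MetricConfig{Frequency: 120, Aggregation: AvgAggregation})

freq, _ := GetMetricFrequency("cpu_load")
// freq is now 120: the larger Frequency value (longer timestep) is kept.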

View File

@@ -0,0 +1,77 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"num-workers": {
"description": "Number of concurrent workers for checkpoint and archive operations",
"type": "integer"
},
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
}
},
"required": ["interval"]
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the directy in which the archived files should be placed.",
"type": "string"
}
},
"required": ["interval", "directory"]
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"memory-cap": {
"description": "Upper memory capacity limit used by metricstore in GB",
"type": "integer"
},
"nats-subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
},
"required": ["checkpoints", "retention-in-memory", "memory-cap"]
}`

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"bufio" "bufio"

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"bufio" "bufio"

View File

@@ -0,0 +1,385 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides level.go: Hierarchical tree structure for metric storage.
//
// # Level Architecture
//
// The Level type forms a tree structure where each node represents a level in the
// ClusterCockpit hierarchy: cluster → host → socket → core → hwthread, with special
// nodes for memory domains and accelerators.
//
// Structure:
//
// Root Level (cluster="emmy")
// ├─ Level (host="node001")
// │ ├─ Level (socket="0")
// │ │ ├─ Level (core="0") [stores cpu0 metrics]
// │ │ └─ Level (core="1") [stores cpu1 metrics]
// │ └─ Level (socket="1")
// │ └─ ...
// └─ Level (host="node002")
// └─ ...
//
// Each Level can:
// - Hold data (metrics slice of buffer pointers)
// - Have child nodes (children map[string]*Level)
// - Both simultaneously (inner nodes can store aggregated metrics)
//
// # Selector Paths
//
// Selectors are hierarchical paths: []string{"cluster", "host", "component"}.
// Example: []string{"emmy", "node001", "cpu0"} navigates to the cpu0 core level.
//
// # Concurrency
//
// RWMutex protects children map and metrics slice. Read-heavy workload (metric reads)
// uses RLock. Writes (new levels, buffer updates) use Lock. Double-checked locking
// prevents races during level creation.
package metricstore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
// Level represents a node in the hierarchical metric storage tree.
//
// Can be both a leaf or inner node. Inner nodes hold data in 'metrics' for aggregated
// values (e.g., socket-level metrics derived from core-level data). Named "Level"
// instead of "node" to avoid confusion with cluster nodes (hosts).
//
// Fields:
// - children: Map of child level names to Level pointers (e.g., "cpu0" → Level)
// - metrics: Slice of buffer pointers (one per metric, indexed by MetricConfig.offset)
// - lock: RWMutex for concurrent access (read-heavy, write-rare)
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// findLevelOrCreate navigates to or creates the level specified by selector.
//
// Recursively descends the tree, creating missing levels as needed. Uses double-checked
// locking: RLock first (fast path), then Lock if creation needed (slow path), then
// re-check after acquiring Lock to handle races.
//
// Example selector: []string{"emmy", "node001", "cpu0"}
// Navigates: root → emmy → node001 → cpu0, creating levels as needed.
//
// Parameters:
// - selector: Hierarchical path (consumed recursively, decreasing depth)
// - nMetrics: Number of metric slots to allocate in new levels
//
// Returns:
// - *Level: The target level (existing or newly created)
//
// Note: sync.Map may improve performance for high-concurrency writes, but current
// approach suffices for read-heavy workload.
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
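A package-internal sketch (e.g., in a test) of the selector navigation described above; cluster and host names are invented:

// Inside the metricstore package:
root := &Level{}

// Creates emmy -> node001 -> cpu0 on the first call ...
a := root.findLevelOrCreate([]string{"emmy", "node001", "cpu0"}, 4)
// ... and returns the same level on the second call.
b := root.findLevelOrCreate([]string{"emmy", "node001", "cpu0"}, 4)
// a == b

// Read-only lookup of a path that was never created returns nil.
missing := root.findLevel([]string{"emmy", "node999"})
_ = missing // nil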
// collectPaths gathers all selector paths at the specified depth in the tree.
//
// Recursively traverses children, collecting paths when currentDepth+1 == targetDepth.
// Each path is a selector that can be used with findLevel() or findBuffers().
//
// Explicitly copies slices to avoid shared underlying arrays between siblings, preventing
// unintended mutations.
//
// Parameters:
// - currentDepth: Depth of current level (0 = root)
// - targetDepth: Depth to collect paths from
// - currentPath: Path accumulated so far
// - results: Output slice (appended to)
//
// Example: collectPaths(0, 2, []string{}, &results) collects all 2-level paths
// like []string{"emmy", "node001"}, []string{"emmy", "node002"}, etc.
func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string, results *[][]string) {
l.lock.RLock()
defer l.lock.RUnlock()
for key, child := range l.children {
if child == nil {
continue
}
// We explicitly make a new slice and copy data to avoid sharing underlying arrays between siblings
newPath := make([]string, len(currentPath))
copy(newPath, currentPath)
newPath = append(newPath, key)
// Check depth, and just return if depth reached
if currentDepth+1 == targetDepth {
*results = append(*results, newPath)
} else {
child.collectPaths(currentDepth+1, targetDepth, newPath, results)
}
}
}
// free removes buffers older than the retention threshold from the entire subtree.
//
// Recursively frees buffers in this level's metrics and all child levels. Buffers
// with standard capacity (BufferCap) are returned to the pool. Called by the
// retention worker to enforce retention policies.
//
// Parameters:
// - t: Retention threshold timestamp (Unix seconds)
//
// Returns:
// - int: Total number of buffers freed in this subtree
// - error: Non-nil on failure (propagated from children)
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
// forceFree removes the oldest buffer from each metric chain in the subtree.
//
// Unlike free(), which removes based on time threshold, this unconditionally removes
// the oldest buffer in each chain. Used by MemoryUsageTracker when memory cap is
// exceeded and time-based retention is insufficient.
//
// Recursively processes current level's metrics and all child levels.
//
// Returns:
// - int: Total number of buffers freed in this subtree
// - error: Non-nil on failure (propagated from children)
func (l *Level) forceFree() (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
// Iterate over metrics in the current level
for i, b := range l.metrics {
if b != nil {
// Attempt to free the oldest buffer in this chain
delme, freedCount := b.forceFreeOldest()
n += freedCount
// If delme is true, it means 'b' itself (the head) was the oldest
// and needs to be removed from the slice.
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
// Recursively traverse children
for _, child := range l.children {
m, err := child.forceFree()
n += m
if err != nil {
return n, err
}
}
return n, nil
}
// sizeInBytes calculates the total memory usage of all buffers in the subtree.
//
// Recursively sums buffer data sizes (count of Float values × sizeof(Float)) across
// this level's metrics and all child levels. Used by MemoryUsageTracker to enforce
// memory cap limits.
//
// Returns:
// - int64: Total bytes used by buffer data in this subtree
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
// findLevel navigates to the level specified by selector, returning nil if not found.
//
// Read-only variant of findLevelOrCreate. Does not create missing levels.
// Recursively descends the tree following the selector path.
//
// Parameters:
// - selector: Hierarchical path (e.g., []string{"emmy", "node001", "cpu0"})
//
// Returns:
// - *Level: The target level, or nil if any component in the path does not exist
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
// findBuffers invokes callback on all buffers matching the selector pattern.
//
// Supports flexible selector patterns (from cc-lib/util.Selector):
// - Exact match: Selector element with String set (e.g., "node001")
// - Group match: Selector element with Group set (e.g., ["cpu0", "cpu2", "cpu4"])
// - Wildcard: Selector element with Any=true (matches all children)
//
// Empty selector (len==0) matches current level's buffer at 'offset' and recursively
// all descendant buffers at the same offset (used for aggregation queries).
//
// Parameters:
// - selector: Pattern to match (consumed recursively)
// - offset: Metric index in metrics slice (from MetricConfig.offset)
// - f: Callback invoked on each matching buffer
//
// Returns:
// - error: First error returned by callback, or nil if all succeeded
//
// Example:
//
// // Find all cpu0 buffers across all hosts:
// findBuffers([]Selector{{Any: true}, {String: "cpu0"}}, metricOffset, callback)
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}

View File

@@ -3,7 +3,7 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package memorystore package metricstore
import ( import (
"context" "context"
@@ -12,8 +12,8 @@ import (
"time" "time"
"github.com/ClusterCockpit/cc-backend/pkg/nats" "github.com/ClusterCockpit/cc-backend/pkg/nats"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/influxdata/line-protocol/v2/lineprotocol" "github.com/influxdata/line-protocol/v2/lineprotocol"
) )
@@ -29,29 +29,30 @@ func ReceiveNats(ms *MemoryStore,
} }
var wg sync.WaitGroup var wg sync.WaitGroup
msgs := make(chan []byte, workers*2) msgs := make(chan []byte, workers*2)
for _, sc := range Keys.Subscriptions { for _, sc := range *Keys.Subscriptions {
clusterTag := sc.ClusterTag clusterTag := sc.ClusterTag
if workers > 1 { if workers > 1 {
wg.Add(workers) wg.Add(workers)
for range workers { for range workers {
go func() { go func() {
defer wg.Done()
for m := range msgs { for m := range msgs {
dec := lineprotocol.NewDecoderWithBytes(m) dec := lineprotocol.NewDecoderWithBytes(m)
if err := DecodeLine(dec, ms, clusterTag); err != nil { if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error()) cclog.Errorf("error: %s", err.Error())
} }
} }
wg.Done()
}() }()
} }
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) { nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
msgs <- data select {
case msgs <- data:
case <-ctx.Done():
}
}) })
} else { } else {
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) { nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
@@ -64,7 +65,11 @@ func ReceiveNats(ms *MemoryStore,
cclog.Infof("NATS subscription to '%s' established", sc.SubscribeTo) cclog.Infof("NATS subscription to '%s' established", sc.SubscribeTo)
} }
go func() {
<-ctx.Done()
close(msgs) close(msgs)
}()
wg.Wait() wg.Wait()
return nil return nil

View File

@@ -0,0 +1,750 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package metricstore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"slices"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
shutdownFuncMu sync.Mutex // Protects shutdownFunc from concurrent access
)
// NodeProvider provides information about nodes currently in use by running jobs.
//
// This interface allows metricstore to query job information without directly
// depending on the repository package, breaking the import cycle.
//
// Implementations should return nodes that are actively processing jobs started
// before the given timestamp. These nodes will be excluded from retention-based
// garbage collection to prevent data loss for jobs that are still running or
// recently completed.
type NodeProvider interface {
// GetUsedNodes returns a map of cluster names to sorted lists of unique hostnames
// that are currently in use by jobs that started before the given timestamp.
//
// Parameters:
// - ts: Unix timestamp threshold - returns nodes with jobs started before this time
//
// Returns:
// - Map of cluster names to lists of node hostnames that should be excluded from garbage collection
// - Error if the query fails
GetUsedNodes(ts int64) (map[string][]string, error)
}
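Purely for illustration, the NodeProvider method set could be satisfied by a provider backed by a static map; this is not the repository-backed provider cc-backend actually wires in:

package main

import "fmt"

// staticNodeProvider satisfies the same method set as the NodeProvider
// interface above, returning fixed data.
type staticNodeProvider struct {
	used map[string][]string
}

// GetUsedNodes ignores the timestamp and returns the static mapping.
func (p staticNodeProvider) GetUsedNodes(ts int64) (map[string][]string, error) {
	return p.used, nil
}

func main() {
	p := staticNodeProvider{used: map[string][]string{
		"emmy": {"node001", "node002"},
	}}
	nodes, _ := p.GetUsedNodes(1700000000)
	fmt.Println(nodes["emmy"]) // these hosts would be excluded from retention GC
}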
// Metric represents a single metric data point to be written to the store.
type Metric struct {
Name string
Value schema.Float
// MetricConfig contains frequency and aggregation settings for this metric.
// If Frequency is 0, configuration will be looked up from MemoryStore.Metrics during Write().
MetricConfig MetricConfig
}
// MemoryStore is the main in-memory time-series metric storage implementation.
//
// It organizes metrics in a hierarchical tree structure where each level represents
// a component of the system hierarchy (e.g., cluster → host → CPU). Each level can
// store multiple metrics as time-series buffers.
//
// The store is initialized as a singleton via InitMetrics() and accessed via GetMemoryStore().
// All public methods are safe for concurrent use.
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
nodeProvider NodeProvider
}
// Init initializes the metric store from configuration and starts background workers.
//
// This function must be called exactly once before any other metricstore operations.
// It performs the following initialization steps:
// 1. Validates and decodes the metric store configuration
// 2. Configures worker pool size (defaults to NumCPU/2+1, max 10)
// 3. Loads metric configurations from all registered clusters
// 4. Restores checkpoints within the retention window
// 5. Starts background workers for retention, checkpointing, archiving, and monitoring
// 6. Optionally subscribes to NATS for real-time metric ingestion
//
// Parameters:
// - rawConfig: JSON configuration for the metric store (see MetricStoreConfig)
// - wg: WaitGroup that will be incremented for each background goroutine started
//
// The function will call cclog.Fatal on critical errors during initialization.
// Use Shutdown() to cleanly stop all background workers started by Init().
//
// Note: Signal handling must be implemented by the caller. Call Shutdown() when
// receiving termination signals to ensure checkpoint data is persisted.
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc *schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Initialize the singleton MemoryStore with the collected metric configurations
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that only once the ratio of new allocations exceeds the
// previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
retentionGoroutines := 1
checkpointingGoroutines := 1
dataStagingGoroutines := 1
archivingGoroutines := 1
memoryUsageTracker := 1
totalGoroutines := retentionGoroutines +
checkpointingGoroutines +
dataStagingGoroutines +
archivingGoroutines +
memoryUsageTracker
wg.Add(totalGoroutines)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
MemoryUsageTracker(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// Shutdown() when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFuncMu.Lock()
shutdownFunc = shutdown
shutdownFuncMu.Unlock()
if Keys.Subscriptions != nil {
err = ReceiveNats(ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
}
}
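// Example wiring from the caller (illustrative sketch; the signal handling and
// the metricstore package path are assumptions, not part of this file):
//
//	var wg sync.WaitGroup
//	metricstore.Init(rawConfig, &wg)
//	metricstore.GetMemoryStore().SetNodeProvider(provider)
//
//	sigs := make(chan os.Signal, 1)
//	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
//	<-sigs
//	metricstore.Shutdown() // blocks until the final checkpoint is written
//	wg.Wait()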
// InitMetrics initializes the singleton MemoryStore instance with the given metric configurations.
//
// This function must be called before GetMemoryStore() and can only be called once due to
// the singleton pattern. It assigns each metric an internal offset for efficient buffer indexing.
//
// Parameters:
// - metrics: Map of metric names to their configurations (frequency and aggregation strategy)
//
// Panics if any metric has Frequency == 0, which indicates an invalid configuration.
//
// After this call, the global msInstance is ready for use via GetMemoryStore().
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
// GetMemoryStore returns the singleton MemoryStore instance.
//
// Returns the initialized MemoryStore singleton. Calls cclog.Fatal if InitMetrics() was not called first.
//
// This function is safe for concurrent use after initialization.
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
}
return msInstance
}
// SetNodeProvider sets the NodeProvider implementation for the MemoryStore.
// This must be called during initialization to provide job state information
// for selective buffer retention during Free operations.
// If not set, the Free function will fall back to freeing all buffers.
func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) {
ms.nodeProvider = provider
}
// Shutdown performs a graceful shutdown of the metric store.
//
// This function cancels all background goroutines started by Init() and writes
// a final checkpoint to disk before returning. It should be called when the
// application receives a termination signal.
//
// The function will:
// 1. Cancel the context to stop all background workers
// 2. Close NATS message channels if using Avro format
// 3. Write a final checkpoint to preserve in-memory data
// 4. Log any errors encountered during shutdown
//
// Note: This function blocks until the final checkpoint is written.
func Shutdown() {
shutdownFuncMu.Lock()
defer shutdownFuncMu.Unlock()
if shutdownFunc != nil {
shutdownFunc()
}
if Keys.Checkpoints.FileFormat != "json" {
close(LineProtocolMessages)
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
// Retention starts a background goroutine that periodically frees old metric data.
//
// This worker runs at half the retention interval and calls Free() to remove buffers
// older than the configured retention time. It respects the NodeProvider to preserve
// data for nodes with active jobs.
//
// Parameters:
// - wg: WaitGroup to signal completion when context is cancelled
// - ctx: Context for cancellation signal
//
// The goroutine exits when ctx is cancelled.
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
tickInterval := d / 2
if tickInterval <= 0 {
return
}
ticker := time.NewTicker(tickInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := Free(ms, t)
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// MemoryUsageTracker starts a background goroutine that monitors memory usage.
//
// This worker checks memory usage every minute and force-frees buffers if memory
// exceeds the configured cap. It protects against infinite loops by limiting
// iterations and forcing garbage collection between attempts.
//
// Parameters:
// - wg: WaitGroup to signal completion when context is cancelled
// - ctx: Context for cancellation signal
//
// The goroutine exits when ctx is cancelled.
func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d := 1 * time.Minute
if d <= 0 {
return
}
ticker := time.NewTicker(d)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
memoryUsageGB := ms.SizeInGB()
cclog.Infof("[METRICSTORE]> current memory usage: %.2f GB\n", memoryUsageGB)
if memoryUsageGB > float64(Keys.MemoryCap) {
cclog.Warnf("[METRICSTORE]> current memory usage is greater than the Memory Cap: %d GB\n", Keys.MemoryCap)
cclog.Warnf("[METRICSTORE]> starting to force-free the buffers from the Metric Store\n")
freedTotal := 0
const maxIterations = 100
for range maxIterations {
memoryUsageGB = ms.SizeInGB()
if memoryUsageGB < float64(Keys.MemoryCap) {
break
}
freed, err := ms.ForceFree()
if err != nil {
cclog.Errorf("[METRICSTORE]> error while force-freeing the buffers: %s", err)
}
if freed == 0 {
cclog.Errorf("[METRICSTORE]> 0 buffers force-freed in last try, %d total buffers force-freed, memory usage of %.2f GB remains higher than the memory cap of %d GB and there are no buffers left to force-free\n", freedTotal, memoryUsageGB, Keys.MemoryCap)
break
}
freedTotal += freed
runtime.GC()
}
if memoryUsageGB >= float64(Keys.MemoryCap) {
cclog.Errorf("[METRICSTORE]> reached maximum iterations (%d) or no more buffers to free, current memory usage: %.2f GB\n", maxIterations, memoryUsageGB)
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freedTotal)
cclog.Infof("[METRICSTORE]> current memory usage after force-freeing the buffers: %.2f GB\n", memoryUsageGB)
}
}
}
}
}()
}
// Free removes metric data older than the given time while preserving data for active nodes.
//
// This function implements intelligent retention by consulting the NodeProvider (if configured)
// to determine which nodes are currently in use by running jobs. Data for these nodes is
// preserved even if older than the retention time.
//
// Parameters:
// - ms: The MemoryStore instance
// - t: Time threshold - buffers with data older than this will be freed
//
// Returns:
// - Number of buffers freed
// - Error if NodeProvider query fails
//
// Behavior:
// - If no NodeProvider is set: frees all buffers older than t
// - If NodeProvider returns empty map: frees all buffers older than t
// - Otherwise: preserves buffers for nodes returned by GetUsedNodes(), frees others
func Free(ms *MemoryStore, t time.Time) (int, error) {
// If no NodeProvider is configured, free all buffers older than t
if ms.nodeProvider == nil {
return ms.Free(nil, t.Unix())
}
excludeSelectors, err := ms.nodeProvider.GetUsedNodes(t.Unix())
if err != nil {
return 0, err
}
switch lenMap := len(excludeSelectors); lenMap {
// If the length of the map returned by GetUsedNodes() is 0,
// then use default Free method with nil selector
case 0:
return ms.Free(nil, t.Unix())
// Else formulate selectors, exclude those from the map
// and free the rest of the selectors
default:
selectors := GetSelectors(ms, excludeSelectors)
return FreeSelected(ms, selectors, t)
}
}
// FreeSelected frees buffers for specific selectors while preserving others.
//
// This function is used when we want to retain some specific nodes beyond the retention time.
// It iterates through the provided selectors and frees their associated buffers.
//
// Parameters:
// - ms: The MemoryStore instance
// - selectors: List of selector paths to free (e.g., [["cluster1", "node1"], ["cluster2", "node2"]])
// - t: Time threshold for freeing buffers
//
// Returns the total number of buffers freed and any error encountered.
func FreeSelected(ms *MemoryStore, selectors [][]string, t time.Time) (int, error) {
freed := 0
for _, selector := range selectors {
freedBuffers, err := ms.Free(selector, t.Unix())
if err != nil {
cclog.Errorf("error while freeing selected buffers: %#v", err)
}
freed += freedBuffers
}
return freed, nil
}
// GetSelectors returns all selectors at depth 2 (cluster/node level) that are NOT in the exclusion map.
//
// This function generates a list of selectors whose buffers should be freed by excluding
// selectors that correspond to nodes currently in use by running jobs.
//
// Parameters:
// - ms: The MemoryStore instance
// - excludeSelectors: Map of cluster names to node hostnames that should NOT be freed
//
// Returns a list of selectors ([]string paths) that can be safely freed.
//
// Example:
//
// If the tree has paths ["emmy", "node001"] and ["emmy", "node002"],
// and excludeSelectors contains {"emmy": ["node001"]},
// then only [["emmy", "node002"]] is returned.
func GetSelectors(ms *MemoryStore, excludeSelectors map[string][]string) [][]string {
allSelectors := ms.GetPaths(2)
filteredSelectors := make([][]string, 0, len(allSelectors))
for _, path := range allSelectors {
if len(path) < 2 {
continue
}
key := path[0] // The "Key" (Level 1)
value := path[1] // The "Value" (Level 2)
exclude := false
// Check if the key exists in our exclusion map
if excludedValues, exists := excludeSelectors[key]; exists {
// The key exists, now check if the specific value is in the exclusion list
if slices.Contains(excludedValues, value) {
exclude = true
}
}
if !exclude {
filteredSelectors = append(filteredSelectors, path)
}
}
return filteredSelectors
}
// GetPaths returns a list of lists (paths) to the specified depth.
func (ms *MemoryStore) GetPaths(targetDepth int) [][]string {
var results [][]string
// Start recursion. Initial path is empty.
// We treat Root as depth 0.
ms.root.collectPaths(0, targetDepth, []string{}, &results)
return results
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
cclog.Debugf("[METRICSTORE]> Unknown metric '%s' in Write() - skipping", metric.Name)
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig in each element of `metrics` is already filled in
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
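// Example read (illustrative sketch; the selector literal assumes the
// SelectorElement{String: ...} shape from cc-lib's util package, and the
// cluster/host/metric names are placeholders):
//
//	ms := GetMemoryStore()
//	now := time.Now().Unix()
//	data, from, to, res, err := ms.Read(
//		util.Selector{{String: "emmy"}, {String: "node001"}},
//		"cpu_load", now-3600, now, 60)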
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
// ForceFree releases buffers across all levels regardless of the retention
// window. It is used by MemoryUsageTracker when memory usage exceeds the configured cap.
func (m *MemoryStore) ForceFree() (int, error) {
return m.GetLevel(nil).forceFree()
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
func (m *MemoryStore) SizeInGB() float64 {
return float64(m.root.sizeInBytes()) / 1e9
}
// ListChildren returns a list of all children of the level selected by the
// given selector.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}
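// exampleWriteAndFree is an illustrative sketch, not part of the original
// file: cluster, host, and metric names are placeholders. It writes a single
// sample and then frees buffers older than one hour via the retention helper,
// which honors the NodeProvider if one was set.
func exampleWriteAndFree() error {
	ms := GetMemoryStore()
	now := time.Now().Unix()
	if err := ms.Write([]string{"emmy", "node001"}, now, []Metric{
		{Name: "cpu_load", Value: schema.Float(1.5)},
	}); err != nil {
		return err
	}
	_, err := Free(ms, time.Now().Add(-time.Hour))
	return err
}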

View File

@@ -3,12 +3,12 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
-package memorystore
+package metricstore
import (
"testing"
-"github.com/ClusterCockpit/cc-lib/schema"
+"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestAssignAggregationStrategy(t *testing.T) {

View File

@@ -3,7 +3,28 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
-package memorystore
+// This file implements high-level query functions for loading job metric data
// with automatic scope transformation and aggregation.
//
// Key Concepts:
//
// Metric Scopes: Metrics are collected at different granularities (native scope):
// - HWThread: Per hardware thread
// - Core: Per CPU core
// - Socket: Per CPU socket
// - MemoryDomain: Per memory domain (NUMA)
// - Accelerator: Per GPU/accelerator
// - Node: Per compute node
//
// Scope Transformation: The buildQueries functions transform between native scope
// and requested scope by:
// - Aggregating finer-grained data (e.g., HWThread → Core → Socket → Node)
// - Rejecting requests for finer granularity than available
// - Handling special cases (e.g., Accelerator metrics)
//
// Query Building: Constructs APIQuery structures with proper selectors (Type, TypeIds)
// based on cluster topology and job resources.
package metricstore
import (
"context"
@@ -13,10 +34,37 @@ import (
"time" "time"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
// TestLoadDataCallback allows tests to override LoadData behavior for testing purposes.
// When set to a non-nil function, LoadData will call this function instead of the default implementation.
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
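// Example override from a test (illustrative sketch; the enclosing test and
// teardown are assumptions):
//
//	TestLoadDataCallback = func(job *schema.Job, metrics []string,
//		scopes []schema.MetricScope, ctx context.Context, resolution int,
//	) (schema.JobData, error) {
//		return schema.JobData{}, nil
//	}
//	defer func() { TestLoadDataCallback = nil }()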
// LoadData loads metric data for a specific job with automatic scope transformation.
//
// This is the primary function for retrieving job metric data. It handles:
// - Building queries with scope transformation via buildQueries
// - Fetching data from the metric store
// - Organizing results by metric and scope
// - Converting NaN statistics to 0 for JSON compatibility
// - Partial error handling (returns data for successful queries even if some fail)
//
// Parameters:
// - job: Job metadata including cluster, resources, and time range
// - metrics: List of metric names to load
// - scopes: Requested metric scopes (will be transformed to match native scopes)
// - ctx: Context for cancellation (currently unused but reserved for future use)
// - resolution: Data resolution in seconds (0 for native resolution)
//
// Returns:
// - JobData: Map of metric → scope → JobMetric with time-series data and statistics
// - Error: Returns error if query building or fetching fails, or partial error listing failed hosts
//
// Example:
//
// jobData, err := LoadData(job, []string{"cpu_load", "mem_used"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 60)
func LoadData(
job *schema.Job,
metrics []string,
@@ -24,6 +72,10 @@ func LoadData(
ctx context.Context,
resolution int,
) (schema.JobData, error) {
if TestLoadDataCallback != nil {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
queries, assignedScope, err := buildQueries(job, metrics, scopes, int64(resolution))
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
@@ -84,12 +136,7 @@ func LoadData(
*id = query.TypeIds[ndx]
}
-if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
-// "schema.Float()" because regular float64 can not be JSONed when NaN.
-res.Avg = schema.Float(0)
-res.Min = schema.Float(0)
-res.Max = schema.Float(0)
-}
+sanitizeStats(&res)
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
@@ -119,6 +166,10 @@ func LoadData(
return jobData, nil
}
// Pre-converted scope strings avoid repeated string(MetricScope) allocations during
// query construction. These are used in APIQuery.Type field throughout buildQueries
// and buildNodeQueries functions. Converting once at package initialization improves
// performance for high-volume query building.
var (
hwthreadString = string(schema.MetricScopeHWThread)
coreString = string(schema.MetricScopeCore)
@@ -127,12 +178,41 @@ var (
acceleratorString = string(schema.MetricScopeAccelerator)
)
// buildQueries constructs APIQuery structures with automatic scope transformation for a job.
//
// This function implements the core scope transformation logic, handling all combinations of
// native metric scopes and requested scopes. It uses the cluster topology to determine which
// hardware IDs to include in each query.
//
// Scope Transformation Rules:
// - If native scope >= requested scope: Aggregates data (Aggregate=true in APIQuery)
// - If native scope < requested scope: Returns error (cannot increase granularity)
// - Special handling for Accelerator scope (independent of CPU hierarchy)
//
// The function generates one or more APIQuery per (metric, scope, host) combination:
// - For non-aggregated queries: One query with all relevant IDs
// - For aggregated queries: May generate multiple queries (e.g., one per socket/core)
//
// Parameters:
// - job: Job metadata including cluster, subcluster, and resource allocation
// - metrics: List of metrics to query
// - scopes: Requested scopes for each metric
// - resolution: Data resolution in seconds
//
// Returns:
// - []APIQuery: List of queries to execute
// - []schema.MetricScope: Assigned scope for each query (after transformation)
// - error: Returns error if topology lookup fails or unhandled scope combination encountered
func buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int64,
) ([]APIQuery, []schema.MetricScope, error) {
if len(job.Resources) == 0 {
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no resources allocated for job %d", job.JobID)
}
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := []schema.MetricScope{}
@@ -145,7 +225,6 @@ func buildQueries(
for _, metric := range metrics {
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
-// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
@@ -164,10 +243,9 @@ func buildQueries(
}
}
-// Avoid duplicates...
-handledScopes := make([]schema.MetricScope, 0, 3)
-scopesLoop:
+// Avoid duplicates using map for O(1) lookup
+handledScopes := make(map[schema.MetricScope]bool, 3)
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
@@ -175,12 +253,10 @@ func buildQueries(
}
scope := nativeScope.Max(requestedScope)
-for _, s := range handledScopes {
-if scope == s {
-continue scopesLoop
-}
-}
-handledScopes = append(handledScopes, scope)
+if handledScopes[scope] {
+continue
+}
+handledScopes[scope] = true
for _, host := range job.Resources {
hwthreads := host.HWThreads
@@ -225,7 +301,7 @@ func buildQueries(
continue
}
-// HWThread -> HWThead
+// HWThread -> HWThread
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, APIQuery{
Metric: metric,
@@ -349,7 +425,7 @@ func buildQueries(
continue
}
-// MemoryDoman -> Node
+// MemoryDomain -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, APIQuery{
@@ -413,12 +489,26 @@ func buildQueries(
return queries, assignedScope, nil
}
// LoadStats loads only metric statistics (avg/min/max) for a job at node scope.
//
// This is an optimized version of LoadData that fetches only statistics without
// time-series data, reducing bandwidth and memory usage. Always queries at node scope.
//
// Parameters:
// - job: Job metadata
// - metrics: List of metric names
// - ctx: Context (currently unused)
//
// Returns:
// - Map of metric → hostname → statistics
// - Error on query building or fetching failure
func LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
-queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
+// TODO(#166): Add scope parameter for analysis view accelerator normalization
+queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
return nil, err
@@ -470,6 +560,20 @@ func LoadStats(
return stats, nil
}
// LoadScopedStats loads metric statistics for a job with scope-aware grouping.
//
// Similar to LoadStats but supports multiple scopes and returns statistics grouped
// by scope with hardware IDs (e.g., per-core, per-socket statistics).
//
// Parameters:
// - job: Job metadata
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - ctx: Context (currently unused)
//
// Returns:
// - ScopedJobStats: Map of metric → scope → []ScopedStats (with hostname and ID)
// - Error or partial error listing failed queries
func LoadScopedStats(
job *schema.Job,
metrics []string,
@@ -526,12 +630,7 @@ func LoadScopedStats(
*id = query.TypeIds[ndx]
}
-if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
-// "schema.Float()" because regular float64 can not be JSONed when NaN.
-res.Avg = schema.Float(0)
-res.Min = schema.Float(0)
-res.Max = schema.Float(0)
-}
+sanitizeStats(&res)
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: query.Hostname,
@@ -560,6 +659,22 @@ func LoadScopedStats(
return scopedJobStats, nil
}
// LoadNodeData loads metric data for specific nodes in a cluster over a time range.
//
// Unlike LoadData which operates on job resources, this function queries arbitrary nodes
// directly. Useful for system monitoring and node status views.
//
// Parameters:
// - cluster: Cluster name
// - metrics: List of metric names
// - nodes: List of node hostnames (nil = all nodes in cluster via ForAllNodes)
// - scopes: Requested metric scopes (currently unused - always node scope)
// - from, to: Time range
// - ctx: Context (currently unused)
//
// Returns:
// - Map of hostname → metric → []JobMetric
// - Error or partial error listing failed queries
func LoadNodeData(
cluster string,
metrics, nodes []string,
@@ -608,14 +723,10 @@ func LoadNodeData(
metric := query.Metric
qdata := res[0]
if qdata.Error != nil {
-/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
}
-if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
-// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
-qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
-}
+sanitizeStats(&qdata)
hostdata, ok := data[query.Hostname]
if !ok {
@@ -649,6 +760,24 @@ func LoadNodeData(
return data, nil
}
// LoadNodeListData loads metric data for a list of nodes with full scope transformation support.
//
// This is the most flexible node data loading function, supporting arbitrary scopes and
// resolution. Uses buildNodeQueries for proper scope transformation based on topology.
//
// Parameters:
// - cluster: Cluster name
// - subCluster: SubCluster name (empty string to infer from node names)
// - nodes: List of node hostnames
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - resolution: Data resolution in seconds
// - from, to: Time range
// - ctx: Context (currently unused)
//
// Returns:
// - Map of hostname → JobData (metric → scope → JobMetric)
// - Error or partial error listing failed queries
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
@@ -689,7 +818,7 @@ func LoadNodeListData(
} else {
query = req.Queries[i]
}
-// qdata := res[0]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
@@ -735,12 +864,7 @@ func LoadNodeListData(
*id = query.TypeIds[ndx]
}
-if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
-// "schema.Float()" because regular float64 can not be JSONed when NaN.
-res.Avg = schema.Float(0)
-res.Min = schema.Float(0)
-res.Max = schema.Float(0)
-}
+sanitizeStats(&res)
scopeData.Series = append(scopeData.Series, schema.Series{
Hostname: query.Hostname,
@@ -763,6 +887,23 @@ func LoadNodeListData(
return data, nil
}
// buildNodeQueries constructs APIQuery structures for node-based queries with scope transformation.
//
// Similar to buildQueries but operates on node lists rather than job resources.
// Supports dynamic subcluster lookup when subCluster parameter is empty.
//
// Parameters:
// - cluster: Cluster name
// - subCluster: SubCluster name (empty = infer from node hostnames)
// - nodes: List of node hostnames
// - metrics: List of metric names
// - scopes: Requested metric scopes
// - resolution: Data resolution in seconds
//
// Returns:
// - []APIQuery: List of queries to execute
// - []schema.MetricScope: Assigned scope for each query
// - error: Returns error if topology lookup fails or unhandled scope combination
func buildNodeQueries(
cluster string,
subCluster string,
@@ -771,6 +912,10 @@ func buildNodeQueries(
scopes []schema.MetricScope,
resolution int64,
) ([]APIQuery, []schema.MetricScope, error) {
if len(nodes) == 0 {
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no nodes specified for query")
}
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{}
@@ -788,7 +933,6 @@ func buildNodeQueries(
for _, metric := range metrics {
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
-// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster)
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
@@ -807,20 +951,17 @@ func buildNodeQueries(
}
}
-// Avoid duplicates...
-handledScopes := make([]schema.MetricScope, 0, 3)
-scopesLoop:
+// Avoid duplicates using map for O(1) lookup
+handledScopes := make(map[schema.MetricScope]bool, 3)
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
-for _, s := range handledScopes {
-if scope == s {
-continue scopesLoop
-}
-}
-handledScopes = append(handledScopes, scope)
+if handledScopes[scope] {
+continue
+}
+handledScopes[scope] = true
for _, hostname := range nodes {
@@ -843,7 +984,7 @@ func buildNodeQueries(
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
-continue scopesLoop
+continue
}
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
@@ -883,7 +1024,7 @@ func buildNodeQueries(
continue
}
-// HWThread -> HWThead
+// HWThread -> HWThread
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, APIQuery{
Metric: metric,
@@ -1007,7 +1148,7 @@ func buildNodeQueries(
continue
}
-// MemoryDoman -> Node
+// MemoryDomain -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
queries = append(queries, APIQuery{
@@ -1071,10 +1212,37 @@ func buildNodeQueries(
return queries, assignedScope, nil
}
// sanitizeStats converts NaN statistics to zero for JSON compatibility.
//
// schema.Float with NaN values cannot be properly JSON-encoded, so we convert
// NaN to 0. This loses the distinction between "no data" and "zero value",
// but maintains API compatibility.
func sanitizeStats(data *APIMetricData) {
if data.Avg.IsNaN() {
data.Avg = schema.Float(0)
}
if data.Min.IsNaN() {
data.Min = schema.Float(0)
}
if data.Max.IsNaN() {
data.Max = schema.Float(0)
}
}
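// Example (illustrative): a result with Avg=NaN, Min=2, Max=NaN becomes
// Avg=0, Min=2, Max=0 after sanitizeStats.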
// intToStringSlice converts a slice of integers to a slice of strings.
// Used to convert hardware thread/core/socket IDs from topology (int) to APIQuery TypeIds (string).
//
// Optimized to reuse a byte buffer for string conversion, reducing allocations.
func intToStringSlice(is []int) []string {
+if len(is) == 0 {
+return nil
+}
ss := make([]string, len(is))
+buf := make([]byte, 0, 16) // Reusable buffer for integer conversion
for i, x := range is {
-ss[i] = strconv.Itoa(x)
+buf = strconv.AppendInt(buf[:0], int64(x), 10)
+ss[i] = string(buf)
}
return ss
}

View File

@@ -3,13 +3,13 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
-package memorystore
+package metricstore
import (
"errors"
"math"
-"github.com/ClusterCockpit/cc-lib/util"
+"github.com/ClusterCockpit/cc-lib/v2/util"
)
type Stats struct {

View File

@@ -12,7 +12,7 @@ import (
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
"github.com/mattn/go-sqlite3" "github.com/mattn/go-sqlite3"
"github.com/qustavo/sqlhooks/v2" "github.com/qustavo/sqlhooks/v2"
@@ -115,3 +115,26 @@ func GetConnection() *DBConnection {
return dbConnInstance
}
// ResetConnection closes the current database connection and resets the connection state.
// This function is intended for testing purposes only to allow test isolation.
func ResetConnection() error {
if dbConnInstance != nil && dbConnInstance.DB != nil {
if err := dbConnInstance.DB.Close(); err != nil {
return fmt.Errorf("failed to close database connection: %w", err)
}
}
dbConnInstance = nil
dbConnOnce = sync.Once{}
jobRepoInstance = nil
jobRepoOnce = sync.Once{}
nodeRepoInstance = nil
nodeRepoOnce = sync.Once{}
userRepoInstance = nil
userRepoOnce = sync.Once{}
userCfgRepoInstance = nil
userCfgRepoOnce = sync.Once{}
return nil
}
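// Example use in a test harness (illustrative sketch; TestMain and the import
// of "os" are assumptions, not part of this file):
//
//	func TestMain(m *testing.M) {
//		code := m.Run()
//		_ = ResetConnection()
//		os.Exit(code)
//	}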

Some files were not shown because too many files have changed in this diff.