1126 Commits

Author SHA1 Message Date
d12da655e9 Only register PullWorker task if it was configured 2025-12-21 13:39:21 +01:00
50df63a2d2 Remove metricData usage
Is replaced by builtin memorystore API
2025-12-21 13:29:43 +01:00
d23f20f42a Remove InternalCCMSFlag 2025-12-21 08:12:36 +01:00
965561956e Move ccms api to memorystore and make it default. Rename metricDataDispatcher. Refactor and document. 2025-12-21 07:34:17 +01:00
5a65044caf Introduce upstream TS pull 2025-12-21 06:34:17 +01:00
1cd4a57bd3 Remove support for mysql/mariadb 2025-12-20 11:13:41 +01:00
b35172e2f7 Add context information for CLAUDE coding agent 2025-12-20 11:13:02 +01:00
3cfcd30128 Add CLAUDE.md documentation for Claude Code
Provides architecture overview, build commands, and development workflows
to help future Claude Code instances work productively in this codebase.
Includes guidance on GraphQL/REST API patterns, database migrations, and
the repository/metric data architecture.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-20 10:17:54 +01:00
e56532e5c8 Add example json API payloads 2025-12-20 09:35:54 +01:00
fdee4f8938 Integrate NATS API.
Only start either REST start/stop API or NATS start/stop API
2025-12-20 09:21:58 +01:00
Christoph Kluge
7acc89e42d move public dash close button 2025-12-19 17:52:21 +01:00
Christoph Kluge
af7d208c21 remove unused class 2025-12-19 16:16:57 +01:00
Christoph Kluge
91b90d033e fix metric select drag and drop 2025-12-19 15:27:35 +01:00
Christoph Kluge
7a0975b94d final fix render race condition if metrics change in nodeList 2025-12-19 15:10:15 +01:00
Christoph Kluge
c58b01a602 fix wrong render condition order in nodeList 2025-12-19 14:42:02 +01:00
Christoph Kluge
8244449646 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-12-18 15:55:40 +01:00
Christoph Kluge
436afa4a61 fix tag count by including type in grouping 2025-12-18 15:55:30 +01:00
06ed056d43 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-18 15:47:53 +01:00
d446c13546 Restore startDemo script 2025-12-18 15:47:51 +01:00
6e74fa294a Add role-based visibility for metrics
Fixes #387
2025-12-18 15:47:30 +01:00
Christoph Kluge
43bdb56072 add fallback case if metric has no name in nodeListRow 2025-12-18 15:04:03 +01:00
e707fd0893 Provide fallback in archive manager in case fd is not available 2025-12-18 11:26:05 +01:00
Christoph Kluge
19c8e9beb1 move extensive NodeMetricsList handling to node repo func 2025-12-18 10:44:58 +01:00
Aditya Ujeniya
32e5353847 Fix to NATS deadlock and revert demo script 2025-12-17 18:14:36 +01:00
Aditya Ujeniya
d2f2d78954 Changing JWT output to stdout and change to help text 2025-12-17 15:58:42 +01:00
b8fdfc30c0 Fix performance bugs in sqlite archive backend 2025-12-17 10:12:49 +01:00
79a2ca8ae8 Adapt unit test to new API 2025-12-17 08:44:37 +01:00
d1a78c13a4 Make loglevel info default for demo 2025-12-17 08:38:14 +01:00
f4b00e9de1 Use Info instead of warn loglevel for database file missing msg 2025-12-17 08:38:00 +01:00
0a5e155096 Remove debug setting 2025-12-17 07:03:10 +01:00
4ecc050c4c Fix deadlock if NATS is not configured 2025-12-17 07:03:01 +01:00
88dc5036b3 Make import function interruptible and replace countJobs with external call to fd 2025-12-17 06:32:53 +01:00
d30c6ef3bf Make NATS API subjects configurable 2025-12-17 06:08:09 +01:00
0419fec810 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-17 05:46:10 +01:00
43e5fd1131 Add NATS API backend 2025-12-17 05:44:49 +01:00
Christoph Kluge
11e94124cc improve handling and layout if missing data in dashboard 2025-12-16 15:43:57 +01:00
Christoph Kluge
102109388b link to public dashboard in admin options, add return button to public dashboard 2025-12-16 13:54:17 +01:00
Jan Eitzinger
60a69aa0a2 Merge pull request #453 from ClusterCockpit/status_dashboard
Status dashboard
2025-12-16 10:04:49 +01:00
5e2cbd75fa Review and refactor 2025-12-16 09:45:48 +01:00
14f1192ccb Introduce central nats client 2025-12-16 09:35:33 +01:00
72b2560ecf Add progress bar for import function 2025-12-16 09:11:26 +01:00
7fce6fa401 Parallelize the Iter function in all archive backends 2025-12-16 09:11:09 +01:00
e6286768a7 Refactor variable naming and update doc comments 2025-12-16 08:56:48 +01:00
0306723307 Introduce transparent compression for importJob function in all archive backends 2025-12-16 08:55:31 +01:00
6f49998ad3 Switch to new go tool pattern for external tool deps 2025-12-16 08:49:17 +01:00
457c944ec6 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-15 21:25:32 +01:00
33c38f9464 Fix start time in tasks 2025-12-15 21:25:30 +01:00
46351389b6 Add ai agent guidelines 2025-12-15 21:25:00 +01:00
Christoph Kluge
d56b0e93db cleanup routes, cleanup root components 2025-12-15 15:10:10 +01:00
d567a5312e Add flag omitTagged to DeleteJobsBefore
Fixes #344
2025-12-15 14:38:46 +01:00
97a322354f Refactor 2025-12-15 14:06:33 +01:00
554527445b Merge branch 'master' into dev 2025-12-15 13:56:41 +01:00
Christoph Kluge
c5aff1a2ca add autorefresh, remove leftover query 2025-12-15 13:55:02 +01:00
987cc40318 Refactor 2025-12-15 13:50:05 +01:00
104fd1576a Refactor 2025-12-15 13:44:50 +01:00
72ce3954b4 feat: Add omitTagged flag for retention services
Fixes #344
2025-12-15 13:44:17 +01:00
cfa7461855 Refactor 2025-12-15 13:25:41 +01:00
44cda8a232 Add flag to omit tagged jobs from TestFindJobsBetween 2025-12-15 13:25:22 +01:00
cf119e6843 Also initialize job-archive on init flag
Fixes #378
2025-12-15 12:59:12 +01:00
Jan Eitzinger
451744f321 Merge pull request #447 from ClusterCockpit/dependabot/go_modules/github.com/prometheus/common-0.67.4
Bump github.com/prometheus/common from 0.66.1 to 0.67.4
2025-12-15 12:34:17 +01:00
Jan Eitzinger
ca6682b94b Merge pull request #446 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2-1.40.1
Bump github.com/aws/aws-sdk-go-v2 from 1.39.6 to 1.40.1
2025-12-15 12:32:55 +01:00
dependabot[bot]
cbad2341c3 Bump github.com/aws/aws-sdk-go-v2 from 1.39.6 to 1.40.1
Bumps [github.com/aws/aws-sdk-go-v2](https://github.com/aws/aws-sdk-go-v2) from 1.39.6 to 1.40.1.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/v1.39.6...v1.40.1)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2
  dependency-version: 1.40.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-15 11:29:02 +00:00
dependabot[bot]
a956c7b135 Bump github.com/prometheus/common from 0.66.1 to 0.67.4
Bumps [github.com/prometheus/common](https://github.com/prometheus/common) from 0.66.1 to 0.67.4.
- [Release notes](https://github.com/prometheus/common/releases)
- [Changelog](https://github.com/prometheus/common/blob/main/CHANGELOG.md)
- [Commits](https://github.com/prometheus/common/compare/v0.66.1...v0.67.4)

---
updated-dependencies:
- dependency-name: github.com/prometheus/common
  dependency-version: 0.67.4
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-15 11:28:21 +00:00
Jan Eitzinger
ea6caeb2f0 Merge pull request #445 from ClusterCockpit/dependabot/go_modules/github.com/golang-migrate/migrate/v4-4.19.1
Bump github.com/golang-migrate/migrate/v4 from 4.18.2 to 4.19.1
2025-12-15 12:27:34 +01:00
Jan Eitzinger
c17e8b1156 Merge pull request #444 from ClusterCockpit/dependabot/go_modules/github.com/linkedin/goavro/v2-2.14.1
Bump github.com/linkedin/goavro/v2 from 2.14.0 to 2.14.1
2025-12-15 12:27:14 +01:00
Jan Eitzinger
b993b1e096 Merge pull request #443 from ClusterCockpit/dependabot/go_modules/github.com/go-co-op/gocron/v2-2.18.2
Bump github.com/go-co-op/gocron/v2 from 2.17.0 to 2.18.2
2025-12-15 12:24:41 +01:00
d7d81e352d Update cc-lib to v1.0.0 and fix bug in init 2025-12-15 12:20:42 +01:00
078c608bda Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-15 11:25:44 +01:00
f2e57f9edd Update gitignore 2025-12-15 11:25:42 +01:00
5698d5216f Reformat 2025-12-15 11:24:56 +01:00
10aa2bfbd3 Add support for ClusterConfig 2025-12-15 11:24:12 +01:00
6cfed989ff Fix bugs in sqlite backend 2025-12-15 11:23:53 +01:00
ab70acd582 Also import ClusterConfigs 2025-12-15 11:20:49 +01:00
Christoph Kluge
79e1c236fe cleanup, adapt internalDash, remove debug query value 2025-12-12 17:51:54 +01:00
Aditya Ujeniya
fed62b6c45 Functionality to configure resampling 2025-12-12 14:51:01 +01:00
Christoph Kluge
0d62181272 move roofline elements below series data render 2025-12-12 11:19:37 +01:00
Christoph Kluge
290a71bd48 Merge branch 'dev' into status_dashboard 2025-12-11 18:56:09 +01:00
Christoph Kluge
6e385db378 color roofline plot, add option to match pie and table color for nodestate 2025-12-11 18:51:19 +01:00
Jan Eitzinger
ffe8329b84 Merge pull request #448 from ClusterCockpit/dev
Dev
2025-12-11 11:27:33 +01:00
f13be109c2 Fix: Replace all Printf log messages with appropriate loglevels 2025-12-11 11:20:11 +01:00
d24d85b970 Adapt tests to new API 2025-12-11 09:39:38 +01:00
8d44ac90ad Fix: Busywait loop in archiver and slow shutdown
Remove unblocking default in select
Add shutdown handler with context and timeout
2025-12-11 09:29:10 +01:00
Christoph Kluge
4083de2a51 Add public dashboard and route, add DoubleMetricPlot and GQL queries
- add roofline legend display switch
- small fixes
2025-12-09 10:26:55 +01:00
dependabot[bot]
131df075db Bump github.com/golang-migrate/migrate/v4 from 4.18.2 to 4.19.1
Bumps [github.com/golang-migrate/migrate/v4](https://github.com/golang-migrate/migrate) from 4.18.2 to 4.19.1.
- [Release notes](https://github.com/golang-migrate/migrate/releases)
- [Commits](https://github.com/golang-migrate/migrate/compare/v4.18.2...v4.19.1)

---
updated-dependencies:
- dependency-name: github.com/golang-migrate/migrate/v4
  dependency-version: 4.19.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:24 +00:00
dependabot[bot]
afd6f50ba2 Bump github.com/linkedin/goavro/v2 from 2.14.0 to 2.14.1
Bumps [github.com/linkedin/goavro/v2](https://github.com/linkedin/goavro) from 2.14.0 to 2.14.1.
- [Release notes](https://github.com/linkedin/goavro/releases)
- [Commits](https://github.com/linkedin/goavro/compare/v2.14.0...v2.14.1)

---
updated-dependencies:
- dependency-name: github.com/linkedin/goavro/v2
  dependency-version: 2.14.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:19 +00:00
dependabot[bot]
ad01366705 Bump github.com/go-co-op/gocron/v2 from 2.17.0 to 2.18.2
Bumps [github.com/go-co-op/gocron/v2](https://github.com/go-co-op/gocron) from 2.17.0 to 2.18.2.
- [Release notes](https://github.com/go-co-op/gocron/releases)
- [Commits](https://github.com/go-co-op/gocron/compare/v2.17.0...v2.18.2)

---
updated-dependencies:
- dependency-name: github.com/go-co-op/gocron/v2
  dependency-version: 2.18.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:11 +00:00
6325793902 Add check in initDB importer if jobMeta is nil 2025-12-04 15:38:21 +01:00
Jan Eitzinger
8ea176f9da Merge pull request #441 from ClusterCockpit/dev
Dev
2025-12-04 15:16:02 +01:00
03b5272e44 Upgrade to latest cc-lib 2025-12-04 15:08:22 +01:00
7da01975f7 archive-migration: Add check for archive version and rewrite version after migration 2025-12-04 15:08:03 +01:00
7cff8bbfd2 Add documentation for importer 2025-12-04 15:07:09 +01:00
Jan Eitzinger
c98cbb33f8 Merge pull request #440 from ClusterCockpit/dev
Dev
2025-12-04 10:46:11 +01:00
f3ea95535b Remove init functionality from Makefile 2025-12-04 10:24:33 +01:00
b9b84b7971 Use --init flag in startDemo script 2025-12-04 07:43:55 +01:00
be7340ca30 archive-migration cleanup and fix path in README.md 2025-12-04 07:43:26 +01:00
881c4566dd Reformat and remove optional ui-config 2025-12-04 07:42:55 +01:00
7efbb0217f Update config for ccbackend --init 2025-12-04 07:42:26 +01:00
9e2ce39cde Update startDemo script 2025-12-04 07:03:01 +01:00
Jan Eitzinger
0ff6cae1c3 Merge pull request #438 from rpabel/master
return directly on error
2025-12-04 06:40:43 +01:00
Jan Eitzinger
d02ba3d717 Merge pull request #439 from ClusterCockpit/dev
Dev
2025-12-04 06:38:16 +01:00
6aa830adb6 Merge dependabot and upgrade dependencies 2025-12-04 06:34:40 +01:00
be6603cbb9 Merge branch 'master' into dev 2025-12-04 06:29:48 +01:00
Jan Eitzinger
8d208929d5 Merge pull request #434 from ClusterCockpit/dependabot/go_modules/github.com/go-ldap/ldap/v3-3.4.12
Bump github.com/go-ldap/ldap/v3 from 3.4.10 to 3.4.12
2025-12-04 06:24:59 +01:00
Jan Eitzinger
cb0f96b737 Merge pull request #433 from ClusterCockpit/dependabot/go_modules/github.com/nats-io/nats.go-1.47.0
Bump github.com/nats-io/nats.go from 1.46.1 to 1.47.0
2025-12-04 06:24:26 +01:00
Jan Eitzinger
83723ab050 Merge pull request #432 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/graphql-16.12.0
Bump graphql from 16.11.0 to 16.12.0 in /web/frontend
2025-12-04 06:23:59 +01:00
Jan Eitzinger
3abaefa550 Merge pull request #430 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/rollup/plugin-commonjs-29.0.0
Bump @rollup/plugin-commonjs from 28.0.3 to 29.0.0 in /web/frontend
2025-12-04 06:23:11 +01:00
Jan Eitzinger
389010dbbd Merge pull request #429 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/chart.js-4.5.1
Bump chart.js from 4.4.9 to 4.5.1 in /web/frontend
2025-12-04 06:22:44 +01:00
81fe2c043e Upgrade to latest cc-lib 2025-12-04 06:21:28 +01:00
c76e9bb3fe Fix error in unit test 2025-12-04 06:20:45 +01:00
48b68d3410 Fix aws endpoint deprecation 2025-12-04 06:20:19 +01:00
2b64b31393 Merge branch 'ai-review' into dev 2025-12-03 15:01:18 +01:00
2333068de7 Add tools for archive manager and archive-migration
Needs testing and review
2025-12-03 14:55:22 +01:00
78530029ef Reformat 2025-12-03 14:54:48 +01:00
329b6e5640 Review refactored code 2025-12-03 14:54:09 +01:00
Christoph Kluge
967f0a3294 remove non required update trigger 2025-11-26 11:00:41 +01:00
Christoph Kluge
6eb779d359 update frontend dependencies, remove deprecated rollup options 2025-11-25 15:49:04 +01:00
Christoph Kluge
414147177a Vacuum not allowed within a migration transaction 2025-11-24 13:30:25 +01:00
Christoph Kluge
3b37f3630c add vacuum statement to migration
- reduces DB size after job table migration
2025-11-24 13:21:14 +01:00
Christoph Kluge
7c1a818582 fix outdated condition in findJobFootprintThresholds 2025-11-21 16:05:47 +01:00
Christoph Kluge
c4cf7e9707 Recreate job table indices after copy, add node table timestamp indices 2025-11-21 13:44:05 +01:00
Christoph Kluge
1ceb681521 fix missed out keyword 2025-11-20 15:58:40 +01:00
Roland Pabel
443176a0d1 return directly on error 2025-11-20 15:09:53 +01:00
Christoph Kluge
261905a364 unify out_of_memory jobState spelling
- underscores used in existing databases
2025-11-20 15:08:22 +01:00
e00288b160 Cleanup dependencies 2025-11-20 14:28:25 +01:00
f141ca926f Increase archive version. Fix unit tests. 2025-11-20 14:28:06 +01:00
f7a0954213 Fix init order. Reformat. 2025-11-20 14:26:27 +01:00
Christoph Kluge
da8d562eba fix hardcoded dev variable 2025-11-20 13:25:09 +01:00
Christoph Kluge
399af8592c switch nodeList logic to SQLite as source of truth, fix nodeList continuous scroll
- keep notindb logic for now
2025-11-20 12:18:13 +01:00
6239e7f19b Merge branch 'dev' into ai-review 2025-11-20 08:59:52 +01:00
d0e1b7186c Add perl script for CC logfile analysis 2025-11-20 07:51:33 +01:00
fea3292f50 Add idea directory to gitignore 2025-11-20 07:50:10 +01:00
9973aa9ffa Refactor api package 2025-11-20 07:48:45 +01:00
0b38a980d2 Port importer to new transaction api 2025-11-20 07:39:16 +01:00
20838b6882 Add documentation to repository package 2025-11-20 07:38:54 +01:00
8f4ef1e274 Refactor repository
Fix issues
Improve transaction API
Make hardcoded constants configurable
Make error messages consistent and always add context info
2025-11-20 06:58:45 +01:00
e1c7583670 Add sqlite and s3 job archive backend
Add documentation
Extend config
2025-11-19 17:00:11 +01:00
39a2157d46 Refactor tagger package
Fix issues
Add documentation
Add unit tests
2025-11-19 16:58:48 +01:00
dd63e7157a Refactor memorystore
Fix issues
Add unit test
Add documentation
2025-11-19 16:58:02 +01:00
340efd7926 Refactor auth package
Fix security issues
Remove redundant code
Add documentation
Add units tests
2025-11-19 16:54:01 +01:00
ecc6194b57 Refactor main package
Fix issues.
Break down main routine.
Add documentation.
Remove globals.
2025-11-19 16:53:04 +01:00
Christoph Kluge
90c3381954 add nodeState info display and filtering to systems views 2025-11-18 15:56:55 +01:00
Christoph Kluge
21334c8026 make active metrics reactive to cluster filter 2025-11-14 14:45:22 +01:00
Christoph Kluge
cbdef6ce9e fix missing rooflines in analysis heatmap plot 2025-11-14 14:02:16 +01:00
Christoph Kluge
591cd9fd66 review analysis view layout, add title with info 2025-11-14 11:28:48 +01:00
Christoph Kluge
e8d2a45afb add allocated cores gauge to status view, fix stacked labels 2025-11-14 10:40:42 +01:00
Christoph Kluge
3b533938a6 review status view components, make node states refreshable 2025-11-13 17:27:41 +01:00
Christoph Kluge
9fe342a7e9 fix metricSelect error if cluster filter is active 2025-11-13 13:40:31 +01:00
Christoph Kluge
2152ced97a improve metricplot threshold handling
- simplified and adaptive thresholds for shared jobs
2025-11-13 11:18:40 +01:00
Christoph Kluge
404be5f317 add optional legend flip to plots 2025-11-12 17:20:50 +01:00
Christoph Kluge
f56783a439 add plot cursor sync to nodelist rows 2025-11-12 16:44:49 +01:00
Christoph Kluge
fb278182d3 add schedulerState resolver 2025-11-12 13:50:09 +01:00
Christoph Kluge
c2c63d2f67 fix backend node queries
- wrong table name
- wrong scan field count: timestamp catch for log
2025-11-12 13:38:58 +01:00
Christoph Kluge
7f740455fe fix old gql field name 2025-11-12 13:09:31 +01:00
Christoph Kluge
946b992746 fix leftover dev variable 2025-11-12 12:46:00 +01:00
Christoph Kluge
a6c43e6f2f finalize timed node state frontend code for status view 2025-11-11 17:03:59 +01:00
Christoph Kluge
ecad52c18d fix: fix default time range select values 2025-11-06 11:15:11 +01:00
Christoph Kluge
e49e5a0474 finalize timed node state backend code, concat functions 2025-11-05 18:17:29 +01:00
dependabot[bot]
9231b3cfca Bump github.com/go-ldap/ldap/v3 from 3.4.10 to 3.4.12
Bumps [github.com/go-ldap/ldap/v3](https://github.com/go-ldap/ldap) from 3.4.10 to 3.4.12.
- [Release notes](https://github.com/go-ldap/ldap/releases)
- [Commits](https://github.com/go-ldap/ldap/compare/v3.4.10...v3.4.12)

---
updated-dependencies:
- dependency-name: github.com/go-ldap/ldap/v3
  dependency-version: 3.4.12
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:08:39 +00:00
dependabot[bot]
68e0159292 Bump github.com/nats-io/nats.go from 1.46.1 to 1.47.0
Bumps [github.com/nats-io/nats.go](https://github.com/nats-io/nats.go) from 1.46.1 to 1.47.0.
- [Release notes](https://github.com/nats-io/nats.go/releases)
- [Commits](https://github.com/nats-io/nats.go/compare/v1.46.1...v1.47.0)

---
updated-dependencies:
- dependency-name: github.com/nats-io/nats.go
  dependency-version: 1.47.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:08:35 +00:00
dependabot[bot]
1a674590bf Bump graphql from 16.11.0 to 16.12.0 in /web/frontend
Bumps [graphql](https://github.com/graphql/graphql-js) from 16.11.0 to 16.12.0.
- [Release notes](https://github.com/graphql/graphql-js/releases)
- [Commits](https://github.com/graphql/graphql-js/compare/v16.11.0...v16.12.0)

---
updated-dependencies:
- dependency-name: graphql
  dependency-version: 16.12.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:48 +00:00
dependabot[bot]
1ef47e7b3f Bump @rollup/plugin-commonjs from 28.0.3 to 29.0.0 in /web/frontend
Bumps [@rollup/plugin-commonjs](https://github.com/rollup/plugins/tree/HEAD/packages/commonjs) from 28.0.3 to 29.0.0.
- [Changelog](https://github.com/rollup/plugins/blob/master/packages/commonjs/CHANGELOG.md)
- [Commits](https://github.com/rollup/plugins/commits/commonjs-v29.0.0/packages/commonjs)

---
updated-dependencies:
- dependency-name: "@rollup/plugin-commonjs"
  dependency-version: 29.0.0
  dependency-type: direct:development
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:36 +00:00
dependabot[bot]
214a2762df Bump chart.js from 4.4.9 to 4.5.1 in /web/frontend
Bumps [chart.js](https://github.com/chartjs/Chart.js) from 4.4.9 to 4.5.1.
- [Release notes](https://github.com/chartjs/Chart.js/releases)
- [Commits](https://github.com/chartjs/Chart.js/compare/v4.4.9...v4.5.1)

---
updated-dependencies:
- dependency-name: chart.js
  dependency-version: 4.5.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:30 +00:00
cb5d06decd Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-29 09:01:05 +01:00
8555a88202 Upgrade dependencies. Regenerate GraphQL API. 2025-10-29 08:55:06 +01:00
Michael Panzlaff
2287f4493a Reapply "Fix wrong memorystore nats schema"
This reverts commit ea7660ddb3.
2025-10-28 13:17:00 +01:00
Jan Eitzinger
bb357f7cab Merge pull request #420 from ClusterCockpit/dependabot/go_modules/github.com/99designs/gqlgen-0.17.81
Bump github.com/99designs/gqlgen from 0.17.66 to 0.17.81
2025-10-28 12:19:23 +01:00
dependabot[bot]
d9b240cd2d Bump github.com/99designs/gqlgen from 0.17.66 to 0.17.81
Bumps [github.com/99designs/gqlgen](https://github.com/99designs/gqlgen) from 0.17.66 to 0.17.81.
- [Release notes](https://github.com/99designs/gqlgen/releases)
- [Changelog](https://github.com/99designs/gqlgen/blob/master/CHANGELOG.md)
- [Commits](https://github.com/99designs/gqlgen/compare/v0.17.66...v0.17.81)

---
updated-dependencies:
- dependency-name: github.com/99designs/gqlgen
  dependency-version: 0.17.81
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:17:27 +00:00
Jan Eitzinger
bea5ee96d9 Merge pull request #422 from ClusterCockpit/dependabot/go_modules/github.com/coreos/go-oidc/v3-3.16.0
Bump github.com/coreos/go-oidc/v3 from 3.12.0 to 3.16.0
2025-10-28 12:16:36 +01:00
Jan Eitzinger
7d205fd526 Merge pull request #423 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/mathjs-15.0.0
Bump mathjs from 12.4.3 to 15.0.0 in /web/frontend
2025-10-28 12:16:08 +01:00
Jan Eitzinger
c15b2a0cbb Merge pull request #424 from ClusterCockpit/dependabot/go_modules/golang.org/x/oauth2-0.32.0
Bump golang.org/x/oauth2 from 0.27.0 to 0.32.0
2025-10-28 12:15:35 +01:00
dependabot[bot]
7ccba30a3d Bump mathjs from 12.4.3 to 15.0.0 in /web/frontend
Bumps [mathjs](https://github.com/josdejong/mathjs) from 12.4.3 to 15.0.0.
- [Changelog](https://github.com/josdejong/mathjs/blob/develop/HISTORY.md)
- [Commits](https://github.com/josdejong/mathjs/compare/v12.4.3...v15.0.0)

---
updated-dependencies:
- dependency-name: mathjs
  dependency-version: 15.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:13:03 +00:00
dependabot[bot]
8091485588 Bump golang.org/x/oauth2 from 0.27.0 to 0.32.0
Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.27.0 to 0.32.0.
- [Commits](https://github.com/golang/oauth2/compare/v0.27.0...v0.32.0)

---
updated-dependencies:
- dependency-name: golang.org/x/oauth2
  dependency-version: 0.32.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:12:58 +00:00
dependabot[bot]
1413f968d6 Bump github.com/coreos/go-oidc/v3 from 3.12.0 to 3.16.0
Bumps [github.com/coreos/go-oidc/v3](https://github.com/coreos/go-oidc) from 3.12.0 to 3.16.0.
- [Release notes](https://github.com/coreos/go-oidc/releases)
- [Commits](https://github.com/coreos/go-oidc/compare/v3.12.0...v3.16.0)

---
updated-dependencies:
- dependency-name: github.com/coreos/go-oidc/v3
  dependency-version: 3.16.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:12:55 +00:00
Jan Eitzinger
d1d1bb09e9 Merge pull request #427 from ClusterCockpit/dev
Pre-Merge 1.5 dev
2025-10-28 12:11:49 +01:00
Aditya Ujeniya
3c1a7e0171 Fixed the behavior of avro write to old files 2025-10-28 09:42:28 +01:00
Jan Eitzinger
0cb50f2f01 Add warning for master branch usability
Added warning about the master branch not being production-ready.
2025-10-28 09:11:23 +01:00
Aditya Ujeniya
2287586700 Revert avro files writing logic 2025-10-28 08:53:43 +01:00
Aditya Ujeniya
ea7660ddb3 Revert "Fix wrong memorystore nats schema"
This reverts commit 856ccbb969.
2025-10-28 08:50:33 +01:00
Aditya Ujeniya
44e98e8f2f Fix to avro reader 2025-10-27 20:44:40 +01:00
Michael Panzlaff
856ccbb969 Fix wrong memorystore nats schema 2025-10-27 14:53:18 +01:00
Aditya Ujeniya
0920286b4c Clean up 2025-10-23 17:58:56 +02:00
Aditya Ujeniya
f34e10cfd9 Schema for metric store 2025-10-23 17:58:17 +02:00
ae5d202661 Remove S3Backend stub 2025-10-23 15:14:28 +02:00
bc43c844fc Fix memoryStore Init and move MetricConfig init 2025-10-20 10:22:40 +02:00
67be9aa27b Refactor
Port logging to cclog, use loglevels
Separate REST API from pkg API
2025-10-19 09:33:40 +02:00
047b997a22 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-18 08:30:47 +02:00
bac51891b7 Move avro into memorystore. Refactor
Does not compile
2025-10-18 08:30:42 +02:00
Christoph Kluge
714d6af7cd initial branch commit, improve countstate backend logic
- stacked component rough sketch
- gql data request pipeline laid out
2025-10-17 18:24:05 +02:00
6efd6334bb Fix unit tests 2025-10-17 07:06:31 +02:00
91f4475d76 Update test db 2025-10-17 07:05:45 +02:00
Christoph Kluge
de309784b4 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-10-16 15:33:59 +02:00
Christoph Kluge
a623cf53f3 revert leftover notfoundhandler experiments 2025-10-16 15:33:56 +02:00
440cd59e50 Revert hpc_cluster to cluster. Refactor. 2025-10-16 14:32:06 +02:00
eefb6f6265 Cleanup after merge 2025-10-16 13:21:22 +02:00
f5e1226837 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-16 13:01:35 +02:00
151f7e701f Disable userConfig unit test 2025-10-16 12:54:29 +02:00
40398497c2 Update Node table code. Add simple unit test 2025-10-16 12:54:16 +02:00
Christoph Kluge
cda10788fb adapt migrated indices to new database structure, include node tables, update job indices 2025-10-15 10:46:24 +02:00
Christoph Kluge
845905d9c8 remove inspect commands for dev 2025-10-15 10:35:35 +02:00
89055506d6 Revert changes to ui config init 2025-10-15 08:54:16 +02:00
Christoph Kluge
5908ae7905 adapt status node query resolution to new node_state table 2025-10-14 18:45:05 +02:00
Christoph Kluge
4131665284 remove gql auto comment 2025-10-14 18:43:16 +02:00
Christoph Kluge
6a43dfb0d7 Fix missing model.Aggregate entry, fix status queries and refresh 2025-10-14 18:43:00 +02:00
3d38d78845 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-13 16:13:53 +02:00
600f19ac80 Sync commit
Does not work yet
2025-10-13 16:12:02 +02:00
Christoph Kluge
0a3a664653 small fixes, set analysisView config defaults 2025-10-09 16:33:14 +02:00
Christoph Kluge
471ec1cd2e change deprecated defaultMetrics loader to new confkey
- see PR #333
2025-10-08 18:24:41 +02:00
Christoph Kluge
e296cd7ca0 add web init with uiconfig file path, add cli flag 2025-10-08 16:25:50 +02:00
Christoph Kluge
31cfa8cd7c fix typo for tagEditDisplay 2025-10-08 12:58:02 +02:00
Christoph Kluge
70fe8aa367 fix systemsView config load and mutation, fix metricSelection checked 2025-10-07 15:46:16 +02:00
Christoph Kluge
cc9dafac6f fix sq.Update call 2025-10-02 18:10:58 +02:00
Christoph Kluge
32429f1481 adapt frontend for new uiConfig keys, add nodeOverview mutation 2025-10-02 18:10:33 +02:00
9485a463b8 Refactor node repository 2025-09-30 10:07:07 +02:00
35c6ab4a08 Ongoing work on node table
Sync commit: Does not compile
2025-09-30 10:06:19 +02:00
e58b0fa015 Add ui config tests and fix bugs 2025-09-30 09:01:54 +02:00
beb92967e5 Update nodestate API and db adapter 2025-09-28 08:26:44 +02:00
015583f1cd Add incremental configuration 2025-09-28 08:26:18 +02:00
d40c54b802 Refactor 2025-09-28 08:24:41 +02:00
647665b6b9 Refactor 2025-09-28 08:24:12 +02:00
4fc78bc382 Refactor variable namings and doc comments 2025-09-27 09:27:36 +02:00
50d000e7e2 Implement UI config handling 2025-09-27 09:26:42 +02:00
Jan Eitzinger
ad500c4bef Merge pull request #416 from ClusterCockpit/add_uiconfig_schema
Add uiconfig schema
2025-09-26 13:38:25 +02:00
Jan Eitzinger
916077c6f8 Merge branch 'dev' into add_uiconfig_schema 2025-09-26 13:27:18 +02:00
Christoph Kluge
935fb238a4 add init context to nodeOverview, add additional key for plot rerender 2025-09-10 18:01:33 +02:00
Christoph Kluge
d03e5b4562 handle metric disabled state explicitly in nodeOverview component 2025-09-10 15:42:13 +02:00
Christoph Kluge
05c45c6468 fix: add missing keys to node overview, solves load to empty overview 2025-09-10 15:31:39 +02:00
9020613a8b Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-09-10 15:14:40 +02:00
be92d5943d Decrease verbosity in jobcache sync task 2025-09-10 15:13:53 +02:00
Aditya Ujeniya
b2368a0751 Connectivity to CCMS feature readded 2025-09-10 14:23:18 +02:00
7948d5f773 Use different demo job-archive for dev branch 2025-09-10 10:34:11 +02:00
Jan Eitzinger
1a16851ad0 Merge pull request #401 from ClusterCockpit/dependabot
Add Dependabot for version updates
2025-09-10 09:18:13 +02:00
Jan Eitzinger
810c14a839 Merge pull request #405 from ClusterCockpit/metricstore
Metricstore Integration
2025-09-10 09:17:33 +02:00
Jan Eitzinger
df0e8eb228 Merge branch 'dev' into metricstore 2025-09-10 09:14:50 +02:00
79605c8a9e Update test pipeline to go 1.25 2025-09-10 09:08:32 +02:00
Aditya Ujeniya
9b644119ae Fix to testdata database 2025-09-09 18:34:10 +02:00
Christoph Kluge
ffa9919019 Merge pull request #403 from ClusterCockpit/rework_status_view
Rework status view

As discussed in office: Tests will be fixed in dev branch
2025-09-09 17:37:26 +02:00
55ca892f90 Merge branch 'metricstore' of github.com:ClusterCockpit/cc-backend into metricstore 2025-09-09 15:04:33 +02:00
eaca187032 Fix testdata for new schema 2025-09-09 15:04:25 +02:00
Aditya Ujeniya
3b9d05cc6d Fix exclusive to shared in svelte and graphql 2025-09-09 14:57:05 +02:00
d00881de2e Refactor and update dependencies 2025-09-09 11:36:02 +02:00
d8e85cf75d Fix migration 2025-09-09 11:35:34 +02:00
39f21763e4 Revert test database 2025-09-09 11:30:20 +02:00
Aditya Ujeniya
af43901ca3 Trial and Test MetricStore components 2025-09-08 22:54:13 +02:00
Aditya Ujeniya
62565b9ae2 Combined metricstore api and functions 2025-09-08 11:29:27 +02:00
Aditya Ujeniya
bca176170c Migration SQL fix 2025-09-03 08:22:15 +02:00
Christoph Kluge
2a91ca0cff Merge branch 'dev' into rework_status_view 2025-08-13 14:29:19 +02:00
Christoph Kluge
19a75554b0 remove outdated components 2025-08-13 14:23:19 +02:00
Christoph Kluge
58ae476a3e move and add interface options for status tabs 2025-08-13 14:22:24 +02:00
Christoph Kluge
44d8254a0b fix layouting 2025-08-12 17:57:04 +02:00
Christoph Kluge
bd2cdfcef2 reorganize plots, reduce tabs, 2025-08-12 17:04:31 +02:00
a50b832c2a Import metric store packages 2025-08-08 14:24:52 +02:00
Christoph Kluge
10194105e3 fix color overflow, add info if no status data 2025-08-08 13:50:09 +02:00
Christoph Kluge
b474288df7 add cbmode to piecharts
- old default colorscheme is now cb colorscheme
2025-08-07 18:20:34 +02:00
Christoph Kluge
f338209f32 rename new roofline component 2025-08-07 16:28:35 +02:00
Christoph Kluge
bef832e45b Build new statusDash, refine newRoofline data render 2025-08-07 16:10:11 +02:00
Christoph Kluge
71cfb4db77 fix: fix metric availability subcluster list overflow 2025-08-05 14:19:03 +02:00
86453e7e11 Port to new job structs
Backup commit: Does not build.
2025-08-05 10:23:54 +02:00
Christoph Kluge
98b9f8e62d Add more information to status dash 2025-08-04 14:50:53 +02:00
44cd8d258d Fix and regenerate Swagger and GraphQL 2025-07-31 12:10:46 +02:00
764b65d094 Add timestamp column to node table 2025-07-31 12:10:01 +02:00
Christoph Kluge
4d2c64b012 remove logging 2025-07-23 15:00:10 +02:00
Christoph Kluge
35c0b0be58 add scheduler and health status pie charts 2025-07-21 16:03:07 +02:00
Christoph Kluge
7a54e2cfb3 add required and minItems flags to uiConfigSchema 2025-07-21 11:37:05 +02:00
Christoph Kluge
54283f6d3c add schema definition for uiConfig 2025-07-21 11:21:54 +02:00
Christoph Kluge
697acd1d88 Extend bubbleRoofline for nodeData, add column to node table, rename nodeStats query 2025-07-18 18:12:07 +02:00
Christoph Kluge
5cdb80b4d6 cleanup indents, add transparency switch to path renderer 2025-07-15 18:49:23 +02:00
Christoph Kluge
e48ff8be73 change bubble render parameters
- Note: data points are hover highlighted by tooltip
2025-07-15 16:36:12 +02:00
Christoph Kluge
096217eea6 cleanup bubbleRoofline code, comment optional code parts 2025-07-15 16:00:55 +02:00
Christoph Kluge
ed5290be86 adds new roofline component for job average based data
- clickable, resource sized and duration colored bubbles
2025-07-14 18:12:34 +02:00
Christoph Kluge
b036c3903c add config fallbacks and notes 2025-07-10 14:57:12 +02:00
Christoph Kluge
57b43b7b60 Split status view into tabbed components 2025-07-07 18:44:24 +02:00
ab1ddb7bd1 Refactor 2025-07-07 14:29:06 +02:00
881f2f32f4 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-07-07 13:09:16 +02:00
0754ba5292 Port configuration to ccConfig scheme
Decentralize config validation
Modularize configuration handling
2025-07-07 13:09:12 +02:00
Christoph Kluge
743a89c3a2 Finalize node query backend functions, fix migration issue 2025-07-04 15:14:15 +02:00
Christoph Kluge
6692c3ab7c add indices for new node table and tags 2025-07-03 15:07:05 +02:00
Thomas Gruber
c16a5fdac4 Create dependabot.yml 2025-07-03 14:46:04 +02:00
Christoph Kluge
60ec7e54f5 Update component header, format, streamline SV5 components 2025-07-02 18:43:25 +02:00
dd48f5ab87 fix: Optimize sqlite settings 2025-07-02 09:12:07 +02:00
Christoph Kluge
db674ec31d Migrate RooflineHM and Scatter components
- With this commit, all SV4 components are migrated to SV5
2025-07-01 18:05:53 +02:00
Christoph Kluge
48150ffc8b Migrate Pie and Polar components 2025-07-01 17:25:52 +02:00
Christoph Kluge
1ad80efab6 Migrate Histogram and Roofline components 2025-07-01 16:33:07 +02:00
Christoph Kluge
aa8789f8f8 Migrate MetricPlot component 2025-07-01 15:50:45 +02:00
Christoph Kluge
56e3f2da5c Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-07-01 15:48:39 +02:00
Christoph Kluge
a4104822e2 format cleanup component arguments 2025-07-01 15:48:30 +02:00
Jan Eitzinger
c13f386e3b Merge pull request #399 from ClusterCockpit/port-to-cclib
Port to cclib
2025-06-30 13:22:22 +02:00
4bd73450b5 Temporary disable archive clean test 2025-06-30 13:00:07 +02:00
64da28e814 Merge branch 'dev' into port-to-cclib 2025-06-30 12:09:28 +02:00
639e1b9c6d Port to cc-lib. Extend legal header. 2025-06-30 12:06:35 +02:00
Christoph Kluge
63e828d2df Comment out dev logging 2025-06-27 18:49:19 +02:00
Christoph Kluge
b8c30b5703 Fix continuous scroll in sv5 joblist, rework joblist logic 2025-06-27 18:42:18 +02:00
Christoph Kluge
805ea91fc2 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-06-27 15:52:57 +02:00
Christoph Kluge
c4c422da57 Migrate jobList and jobListRow 2025-06-27 15:52:54 +02:00
544fb35121 Merge branch 'dev' into port-to-cclib 2025-06-27 14:15:38 +02:00
43edccb284 Add enable jobtagger options. Reformat. 2025-06-27 14:11:37 +02:00
7531ba4b5c Refine app detection
Switch to regexp
2025-06-27 14:11:10 +02:00
983aa592d8 refine highload rule 2025-06-27 12:16:17 +02:00
8378784231 Enclose terms by spaces in app detection 2025-06-27 12:16:06 +02:00
dca25cc601 Safeguard changes to archive 2025-06-27 12:15:42 +02:00
Christoph Kluge
c8fe81cd80 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-06-27 11:43:53 +02:00
c0a4724f57 Add matlab app type and fix tests 2025-06-27 11:31:43 +02:00
484c52d813 Also update job archive on tag deletion 2025-06-27 11:20:22 +02:00
Christoph Kluge
47843b2087 Optimize jobview gql query load 2025-06-27 11:15:17 +02:00
Christoph Kluge
c3a6126799 Migrate and rework job view metricplot wrapper 2025-06-26 18:41:27 +02:00
Christoph Kluge
e94b250541 Migrate nodeList subcomponents 2025-06-26 12:29:48 +02:00
Christoph Kluge
db5f6c7540 Migrate plotgrid, adapt parent components with new snippets 2025-06-25 18:19:24 +02:00
Christoph Kluge
79a6c9e90d Migrate Job View 2025-06-25 17:41:11 +02:00
e2e67e3977 Merge branch 'migrate_svelte5' into dev 2025-06-24 06:53:18 +02:00
6c06450701 Add more tagger rules 2025-06-24 06:52:21 +02:00
Christoph Kluge
d7379a1af2 Migrate jobView components 2025-06-20 18:14:36 +02:00
Christoph Kluge
d731611e0c Migrate single node view, fix route condition 2025-06-20 17:47:06 +02:00
Christoph Kluge
dceb92ba8e Migrate jobCompare and comparison plot 2025-06-20 15:20:26 +02:00
Christoph Kluge
1e039cb1bf Migrate select components and adapt parents 2025-06-18 18:14:56 +02:00
6f3e1ffbe3 Add resource counts to node table 2025-06-18 13:02:11 +02:00
Christoph Kluge
6a6dca3fce Migrate config, migrate analysis plotselection 2025-06-16 17:09:02 +02:00
Christoph Kluge
d6d92071bf fix: remove unnecessary bind, correct page item minimum 2025-06-16 13:04:33 +02:00
Christoph Kluge
d40657dc64 Migrate pagination and jobinfo 2025-06-13 17:05:07 +02:00
Christoph Kluge
6dde2a1e59 Migrate JobSummary and subcomponents 2025-06-13 15:49:51 +02:00
Christoph Kluge
b7823cec16 Migrate header components 2025-06-13 14:46:09 +02:00
Christoph Kluge
eabd7b8d51 Remove unused component 2025-06-13 14:40:07 +02:00
Christoph Kluge
27ec445e54 Small migrations and added migration note 2025-06-13 14:39:55 +02:00
Christoph Kluge
ad108b285f fix continuous scroll next page logic error 2025-06-12 17:20:22 +02:00
Christoph Kluge
f471214ef7 migrate system view, node list and node overview 2025-06-12 16:23:31 +02:00
Christoph Kluge
a0190f8f40 Merge branch 'dev' into migrate_svelte5 2025-06-10 10:02:58 +02:00
82af984023 Implement part of Node query GraphQL callbacks 2025-06-06 17:32:09 +02:00
0373010497 Refactor and fix tagger test 2025-06-06 16:41:48 +02:00
Christoph Kluge
c22d869aa7 Move form to cardbody instead of classing 2025-06-06 16:17:42 +02:00
87c93e90cd Implement node query 2025-06-06 16:04:53 +02:00
3d6dca9386 Add more apps for tagger 2025-06-06 16:04:37 +02:00
Christoph Kluge
f946e7e6ab fix: fix issues after updated dev branch merge 2025-06-06 13:43:13 +02:00
Christoph Kluge
d50dfa5867 Update frontend dependencies: rollup and svelte 2025-06-06 11:14:37 +02:00
249128e011 Cleanup. Re-generate Swagger 2025-06-06 06:30:40 +02:00
ca16a80b1f Add info logging to node repo 2025-06-06 06:12:02 +02:00
Christoph Kluge
e789e7ba9b fix missing state declarations 2025-06-05 18:08:16 +02:00
Christoph Kluge
5048f7be14 Merge branch 'dev' into migrate_svelte5 2025-06-05 17:56:48 +02:00
Christoph Kluge
0e3603f596 fix: layout issues in jobList toolbar 2025-06-05 17:47:03 +02:00
9cd4b3c1cc Convert to all lower case 2025-06-05 16:20:48 +02:00
1d9aa75960 Add determine nodestate routine 2025-06-05 16:15:40 +02:00
Christoph Kluge
0a24ef70e0 fix: fix joblist continuous scroll buildup when refreshing 2025-06-05 15:19:00 +02:00
3b5d3d671e Refactor 2025-06-05 14:27:26 +02:00
7db83d216e Start implementing nodestate rest api 2025-06-05 14:27:21 +02:00
d1a7002422 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-06-05 13:23:39 +02:00
1d8e7e072f Refactor rest api 2025-06-05 13:23:36 +02:00
7466fe7a34 Update GraphQL schema. Refactor node repository 2025-06-05 13:17:24 +02:00
Christoph Kluge
24cf5047da Migrate tags view 2025-06-05 10:51:32 +02:00
Christoph Kluge
1f103e5ef5 Migrate status view 2025-06-05 10:43:44 +02:00
Christoph Kluge
9e87974eb1 Fix compareTable sorting 2025-06-04 17:02:02 +02:00
Christoph Kluge
d806cf76c4 Fix config warning and key name 2025-06-04 16:23:54 +02:00
Christoph Kluge
6e2703998d Migrate jobTag management 2025-06-04 14:45:31 +02:00
6f9737c2c2 Add node repository, extend GraphQL API
Sync commit.
2025-06-04 13:44:37 +02:00
Christoph Kluge
5e696c10d5 Migrate job view stats table 2025-06-04 11:28:45 +02:00
Christoph Kluge
927e25c72c Migrate metricSelection 2025-06-03 13:32:14 +02:00
8b1b99ba35 feat: Add requested memory to job meta data
Fixes #110
2025-06-03 07:16:19 +02:00
2c102cd1ff Fix error in node table migration 2025-06-03 06:55:49 +02:00
Christoph Kluge
42c4926c47 fix refresher sv5 logic 2025-06-02 14:20:32 +02:00
Christoph Kluge
703556d893 Migrate user list and analysis view 2025-06-02 13:51:15 +02:00
Christoph Kluge
0b529a5c3c Migrate and fix filter component and subcomponents 2025-06-02 13:00:47 +02:00
Jan Eitzinger
5186b3f61e Merge pull request #398 from ClusterCockpit/Refactor-job-struct
Refactor job struct
2025-06-02 12:13:43 +02:00
4dc0da5099 Add node table schema 2025-06-02 12:07:01 +02:00
1bad6ba065 Regenerate GraphQL interface 2025-05-28 16:00:47 +02:00
3efee22536 Remove jobMeta and use job struct everywhere 2025-05-28 15:59:21 +02:00
eef48ac3a3 Small fix in highload rule 2025-05-28 14:33:52 +02:00
e35cfbc3dd Refactor 2025-05-28 14:32:56 +02:00
4a5fd96b32 Adapt job class rules 2025-05-28 14:32:49 +02:00
Jan Eitzinger
bdffe73f59 Merge pull request #397 from ClusterCockpit/134-job-tagging
134 job tagging
2025-05-27 13:14:50 +02:00
cdfe722457 Include metric thresholds in rule environment
Not yet tested
2025-05-27 13:02:13 +02:00
0aecea6de2 Refactor. Add Subcluster get metric list helper routine. 2025-05-27 09:23:28 +02:00
5a88c77171 Remove debug output 2025-05-26 14:42:41 +02:00
8003217092 Add string to gromacs app file 2025-05-26 14:41:02 +02:00
9b325041c1 Fix typo in jobCache columns 2025-05-26 14:30:30 +02:00
1e7fbe5d56 Refactor 2025-05-26 13:40:34 +02:00
0261c263f9 Add hint message only if rule matches 2025-05-26 13:36:23 +02:00
8d6ae85b0d Fix bug with job columns 2025-05-26 13:26:18 +02:00
f14bdb3068 Fix bugs in job classifier and tagger infrastructure 2025-05-26 13:08:03 +02:00
3c66840f95 Add tagger config option and command line switch to run taggers on all jobs 2025-05-23 10:13:59 +02:00
733e3ea9d5 Revert interface from jobMeta to job type. Extend job classifier tagger. Cleanup test rules. 2025-05-23 07:48:27 +02:00
ca634bb707 Refactor taggers. Refine Job Hooks. Start job classifier 2025-05-22 07:10:41 +02:00
9abc206d1a Read in tagger config on startup. Safeguard watcher shutdown 2025-05-20 07:10:15 +02:00
85f17c0fd8 Refactor Tagger package. Add fsNotify Service 2025-05-19 16:08:43 +02:00
14bad81b9f Extend Job Hooks and add unit tests
Add job tagger control
2025-05-19 13:25:39 +02:00
Christoph Kluge
ffd596e2c7 Migrate job list view and filter components
- filters now inactive in user jobs, lists and analysis due to missing dispatch
2025-05-19 09:25:23 +02:00
99f8187092 Port tests to new architecture 2025-05-19 09:17:16 +02:00
f30b784f45 Attempt to fix api test
Tests still fail
2025-05-16 17:38:00 +02:00
f06b5f8fc0 Refactor 2025-05-16 17:37:36 +02:00
2e781b900d Staged error handling for job cache 2025-05-16 17:37:24 +02:00
d76b1ae75d feat: add job commit service
Sync jobs from job cache table to main job table.
Enables #392
2025-05-16 17:36:33 +02:00
40110580e0 feat: add job hook support
Fixes #394
2025-05-16 17:33:44 +02:00
eab7961a83 Introduce caching table for faster job inserts
Fixes #392
2025-05-16 17:32:19 +02:00
432e06e801 Add GoString method for jobmeta 2025-05-16 17:19:56 +02:00
fe1ff5c7a3 Update tests from dev 2025-05-16 07:33:33 +02:00
6e66b8e08b Merge branch 'dev' into 134-job-tagging 2025-05-16 07:26:00 +02:00
7abdd0545e Add api for tag handling within cc-backend 2025-05-16 07:24:24 +02:00
Christoph Kluge
3f1768e467 Merge branch 'dev' into migrate_svelte5 2025-05-14 17:06:30 +02:00
Christoph Kluge
f464921ae3 fix: fix user view filter job count 2025-05-14 17:05:58 +02:00
Christoph Kluge
7603ad3fb0 Polish and Format rollup config for svelte5 2025-05-14 11:41:11 +02:00
Christoph Kluge
be7ccc78b8 Update packages, ignore sveltestrap related warnings on compile 2025-05-14 11:02:48 +02:00
Christoph Kluge
b3135c982f Merge latest state branch 'dev' into migrate_svelte5 2025-05-13 18:25:54 +02:00
13386175f5 Merge branch 'dev' into 134-job-tagging 2025-05-13 14:48:58 +02:00
23e8f3dc2d Port to godotenv library
Fixes #376
2025-05-13 14:46:01 +02:00
Jan Eitzinger
b323ce2eef Merge pull request #391 from ClusterCockpit/add_job_comparison
Add job comparison
2025-05-13 14:18:22 +02:00
Jan Eitzinger
08e323ba51 Merge pull request #390 from ClusterCockpit/dependabot/go_modules/golang.org/x/net-0.38.0
Bump golang.org/x/net from 0.36.0 to 0.38.0
2025-05-13 14:12:44 +02:00
dependabot[bot]
9f50f36b1d Bump golang.org/x/net from 0.36.0 to 0.38.0
Bumps [golang.org/x/net](https://github.com/golang/net) from 0.36.0 to 0.38.0.
- [Commits](https://github.com/golang/net/compare/v0.36.0...v0.38.0)

---
updated-dependencies:
- dependency-name: golang.org/x/net
  dependency-version: 0.38.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-13 12:10:40 +00:00
Christoph Kluge
4399c1d590 Add metric units to compareTable head, format metric subheads 2025-05-12 11:39:45 +02:00
Christoph Kluge
f7376f6dca Reduce tick spacing in comparePlots 2025-05-09 17:56:20 +02:00
Christoph Kluge
518cb34340 Add sorting to compareTable 2025-05-09 17:07:39 +02:00
Christoph Kluge
f210a5f508 Remove refresh and textfilter from compareView, force id filters only on compare 2025-05-09 14:36:54 +02:00
Christoph Kluge
9ebc49dd1c add table to compareview, remove debug data view 2025-05-08 15:21:05 +02:00
Christoph Kluge
c119eeb468 Prevent high job counts in compare view by filter removal 2025-05-08 11:28:13 +02:00
Christoph Kluge
ab616f8f79 Fix JobCompare Labelling and Rerender 2025-05-08 10:48:30 +02:00
Christoph Kluge
69286881e4 add manual job selection for comparison in jobs view 2025-05-08 09:28:48 +02:00
Christoph Kluge
4419df8d1b add cluster and subcluster information to compareplot 2025-05-06 18:08:35 +02:00
Christoph Kluge
aed2bd48fc add resource compare graph, add cursor sync, handle jobIds filter 2025-05-06 17:54:13 +02:00
Christoph Kluge
d3d752f90c finalize compareplot prototype, move formattime to units.js 2025-05-06 10:46:30 +02:00
Christoph Kluge
33ecfe88ef add job duration, add starttime and duration to legend 2025-05-06 09:58:28 +02:00
Christoph Kluge
fd52fdd35b add job starttime to legend 2025-05-05 16:41:05 +02:00
Christoph Kluge
1d13d3dccf add and integrate job comparison plot component 2025-05-05 11:26:39 +02:00
Christoph Kluge
1c84bcae35 add filterBuffer for seamless view switch 2025-04-29 18:40:44 +02:00
Christoph Kluge
df497d5952 initial branch commit, add job compare switch, add gql resolver 2025-04-29 15:10:06 +02:00
Jan Eitzinger
f65e122f8d Merge pull request #386 from ClusterCockpit/hotfix
Prepare re-release for v1.4.4
2025-04-28 10:18:44 +02:00
161f0744aa fix: enforce apiAllowedIPs config option
Fixes #385
2025-04-28 09:54:22 +02:00
95de9ad3b3 Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2025-04-28 08:52:27 +02:00
Jan Eitzinger
d5c170055f Merge pull request #384 from brinkcoder/fix/auth-log-iperr
[BUGFIX] correct wrong variable in AuthApi error logging
2025-04-28 08:51:42 +02:00
brinkcoder
61f0521072 fix: correct logging variable from err to ipErr in AuthApi 2025-04-25 22:37:16 +02:00
Christoph Kluge
6ca14c55f2 fix: fix error in jobsMetricStatisticsHistogram calculation
- also reduces overhead, simplifies query
2025-04-25 18:09:21 +02:00
Jan Eitzinger
1309d09aee Merge pull request #383 from ClusterCockpit/hotfix
Remove websocket sse GraphQL support
2025-04-24 12:59:34 +02:00
aba75b3a19 Remove websocket sse GraphQL support 2025-04-24 12:57:37 +02:00
Jan Eitzinger
e87481d8db Merge pull request #382 from ClusterCockpit/hotfix
Prepare Bugfix Release 1.4.4
2025-04-24 11:46:25 +02:00
acaad69917 Prepare Bugfix Release 1.4.4 2025-04-24 11:42:34 +02:00
Jan Eitzinger
ff588ad57a Merge pull request #381 from ClusterCockpit/dev
Dev
2025-04-24 11:18:55 +02:00
65df27154c Cleanup and regenerate Swagger docs 2025-04-24 11:14:51 +02:00
8dfa1957f4 Merge hotfix changes 2025-04-24 11:07:02 +02:00
570eba3794 Cleanup Swagger docs 2025-04-24 11:01:13 +02:00
94a39fc61f Readd tag endpoints 2025-04-24 10:53:55 +02:00
2d359e5f99 Merge rest.go 2025-04-24 10:40:03 +02:00
Jan Eitzinger
04692e0c44 Merge pull request #379 from ClusterCockpit/add_tag_delete
Add Tag Deletion: API and Frontend
2025-04-24 10:09:51 +02:00
Jan Eitzinger
809fd23b88 Merge pull request #380 from ClusterCockpit/review_api_auth
Review api auth
2025-04-24 10:08:18 +02:00
Christoph Kluge
e3653daea3 reduce code in tag svelte view 2025-04-23 17:59:26 +02:00
Christoph Kluge
48fa75386c feat: add tag removal api endpoints 2025-04-23 16:12:56 +02:00
Christoph Kluge
1b3a12a4dc feat: add remove functionality to tag view, add confirm alert 2025-04-23 15:01:12 +02:00
Christoph Kluge
543ddf540e implement removeTagFromList mutation, add tag mutation access checks 2025-04-23 14:51:01 +02:00
Christoph Kluge
a3fb471546 adapt and improve svelte taglist component 2025-04-22 17:33:17 +02:00
Christoph Kluge
277f964b30 move taglist a from go tmpl to svelte component 2025-04-22 13:47:25 +02:00
Christoph Kluge
9bcf7adb67 add api calls for removing tags, initial branch commit 2025-04-17 17:31:59 +02:00
Christoph Kluge
f343fa0071 fix: add name scrambling demo mode to all views
- was missing for analysis, status and nodelist
2025-04-17 11:15:35 +02:00
Christoph Kluge
e5862e9218 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-04-16 18:36:15 +02:00
Christoph Kluge
29ae2423f8 fix metricconfig pointer copy, add disabled metric card in jobView
- skips disabled metrics in backend, see cc-backend tries to retrieve "removed" metrics #377
2025-04-16 18:36:12 +02:00
Christoph Kluge
1755a4a7df remove separate userapiallowedips config and check 2025-04-14 11:58:42 +02:00
Christoph Kluge
25d3325049 add getUsers to admin REST api 2025-04-14 11:36:03 +02:00
Christoph Kluge
fb6a4c3b87 review and move api endpoints secured check 2025-04-09 16:00:27 +02:00
317f80a984 fix: Replace deprecated gqlgen NewDefaultServer call 2025-04-09 09:40:52 +02:00
28cdc1d9e5 fix: Update endpoints in Swagger UI 2025-04-09 09:13:21 +02:00
c2087b15d5 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-04-09 07:28:02 +02:00
a8d785beb3 Remove redundant check in auth package 2025-04-09 07:27:59 +02:00
Christoph Kluge
a6784b5549 fix: reintroduce statstable id natural sort order
- see Use natural sort order for IDs in statistics tables #369
2025-04-08 16:00:07 +02:00
Christoph Kluge
d770292be8 feat: add nodename matcher select to filter, defaults to equal match
- see PR !353
2025-04-08 14:52:07 +02:00
Christoph Kluge
b3a1037ade Merge pull request #353 from brinkcoder/fix-node-filter
Fix node filter to use EXISTS for exact hostname matches
2025-04-08 12:57:04 +02:00
Christoph Kluge
02946cf0b4 fix: fix nodelist filter result displaying wrong information
- missing svelte iteration key added
2025-04-07 17:03:23 +02:00
Christoph Kluge
cf051d5108 Merge pull request #375 from ClusterCockpit/master
Dependabot Update Dev Branch
2025-04-07 16:09:31 +02:00
Christoph Kluge
96977c6183 Merge pull request #374 from ClusterCockpit/review_logging
Review logging
2025-04-07 16:03:48 +02:00
Jan Eitzinger
73d83164fc Merge pull request #373 from ClusterCockpit/dependabot/go_modules/golang.org/x/net-0.36.0
Bump golang.org/x/net from 0.35.0 to 0.36.0
2025-04-04 11:05:01 +02:00
dependabot[bot]
1064f5e4a8 Bump golang.org/x/net from 0.35.0 to 0.36.0
Bumps [golang.org/x/net](https://github.com/golang/net) from 0.35.0 to 0.36.0.
- [Commits](https://github.com/golang/net/compare/v0.35.0...v0.36.0)

---
updated-dependencies:
- dependency-name: golang.org/x/net
  dependency-version: 0.36.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 09:01:59 +00:00
Jan Eitzinger
5be98c7087 Merge pull request #372 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/babel/runtime-7.27.0
Bump @babel/runtime from 7.26.0 to 7.27.0 in /web/frontend
2025-04-04 10:55:34 +02:00
dependabot[bot]
0d689c7dff Bump @babel/runtime from 7.26.0 to 7.27.0 in /web/frontend
Bumps [@babel/runtime](https://github.com/babel/babel/tree/HEAD/packages/babel-runtime) from 7.26.0 to 7.27.0.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.27.0/packages/babel-runtime)

---
updated-dependencies:
- dependency-name: "@babel/runtime"
  dependency-version: 7.27.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 08:45:16 +00:00
Jan Eitzinger
1f24ed46a0 Merge pull request #371 from ClusterCockpit/dependabot/go_modules/github.com/golang-jwt/jwt/v5-5.2.2
Bump github.com/golang-jwt/jwt/v5 from 5.2.1 to 5.2.2
2025-04-04 10:37:18 +02:00
dependabot[bot]
92b4159f9e Bump github.com/golang-jwt/jwt/v5 from 5.2.1 to 5.2.2
Bumps [github.com/golang-jwt/jwt/v5](https://github.com/golang-jwt/jwt) from 5.2.1 to 5.2.2.
- [Release notes](https://github.com/golang-jwt/jwt/releases)
- [Changelog](https://github.com/golang-jwt/jwt/blob/main/VERSION_HISTORY.md)
- [Commits](https://github.com/golang-jwt/jwt/compare/v5.2.1...v5.2.2)

---
updated-dependencies:
- dependency-name: github.com/golang-jwt/jwt/v5
  dependency-version: 5.2.2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 08:35:15 +00:00
Jan Eitzinger
5817b41e29 Merge pull request #368 from ClusterCockpit/dev
Dev
2025-03-20 13:02:23 +01:00
d6b132e3a6 Merge branch 'master' into dev 2025-03-20 12:51:23 +01:00
Jan Eitzinger
318f70f34c Merge pull request #365 from ClusterCockpit/split_statsTable_query
Split StatsTable DataQuery from JobMetrics Query In Job-View
2025-03-20 12:50:23 +01:00
Jan Eitzinger
e41525d40a Merge pull request #366 from ClusterCockpit/hotfix
fix: always return hasNextPage boolean to frontend
2025-03-20 12:49:57 +01:00
Jan Eitzinger
a102220e52 Merge pull request #367 from ClusterCockpit/makefile-fix
Fix 'make -B', don't fail if $(VAR) already exists
2025-03-20 12:47:16 +01:00
Christoph Kluge
e9a214c5b2 fix: add nullSafe condition to monitoringStatus display on metric queryError 2025-03-19 14:57:27 +01:00
Christoph Kluge
c53f5eb144 fix: always return hasNextPage boolean to frontend
- removes dependency on uiDefaults setting
2025-03-18 18:01:37 +01:00
Christoph Kluge
9ed64e0388 Review logging, comment cleanup 2025-03-17 17:39:17 +01:00
Christoph Kluge
93040d4629 Implement LoadNodeData, LoadNodeListData, LoadScopedStats for influxDB2 backend
- Untested
- Only Node Scope
2025-03-17 15:25:33 +01:00
Christoph Kluge
0144ad43f5 Implement NodeListData and ScopedStats for Prometheus Backend 2025-03-17 11:03:51 +01:00
Christoph Kluge
8da2fc30c3 split statsTable data from jobMetrics query, frontend refactor 2025-03-14 16:36:31 +01:00
0e27ae7795 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-03-14 10:52:39 +01:00
33c6cdb9fe Update test workflow 2025-03-14 10:52:27 +01:00
Jan Eitzinger
73b7014469 Merge pull request #355 from ClusterCockpit/hotfix
Prepare Release 1.4.3
2025-03-14 10:09:48 +01:00
25aaf55b93 Add feature to Releasenotes 2025-03-14 10:06:25 +01:00
6a7546c43b Clarify header for breaking changes 2025-03-14 10:03:53 +01:00
0adda4bf7b Merge branch 'master' into hotfix 2025-03-14 10:00:29 +01:00
Christoph Kluge
f5f36427a4 split statsTable data from jobMetrics query, initial commit
- mainly backend changes
- statstable changes only for prototyping
2025-03-13 17:33:55 +01:00
Jan Eitzinger
590bfd3a10 Merge pull request #354 from ClusterCockpit/dev
Dev
2025-03-13 14:22:44 +01:00
exterr2f
16db9bd1a2 Fix node filter: Use EXISTS with Eq for exact match and LIKE for Contains 2025-03-11 12:20:13 +01:00
Christoph Kluge
d0af933b35 feat: add subCluster level frontend keys for metric selections
- applies to jobView and nodeList
2025-03-06 15:39:15 +01:00
Christoph Kluge
2b56b40e6d Review energyFootprint calculation, fix missing numNodes factor, add log 2025-03-06 12:46:25 +01:00
Michael Panzlaff
4b2d7068b3 Revert "add node_fail job state"
This reverts commit 6454576417.
2025-03-04 18:16:02 +01:00
Michael Panzlaff
bd93b8be8e Revert "add node_fail state to database schema"
This reverts commit 65d2698af4.
2025-03-04 18:15:53 +01:00
Michael Panzlaff
aa3fe2b872 Revert "add missing node_fail to db constraints"
This reverts commit d4336b0dcb.
2025-03-04 18:15:46 +01:00
Michael Panzlaff
a61ff915ac Revert "add more missing node_fail states"
This reverts commit 0a3e678329.
2025-03-04 18:15:39 +01:00
Michael Panzlaff
0a3e678329 add more missing node_fail states 2025-03-04 18:03:01 +01:00
Michael Panzlaff
d4336b0dcb add missing node_fail to db constraints 2025-03-04 18:00:02 +01:00
Michael Panzlaff
65d2698af4 add node_fail state to database schema 2025-03-04 17:47:49 +01:00
Michael Panzlaff
6454576417 add node_fail job state 2025-03-04 17:42:09 +01:00
Michael Panzlaff
a485bd5977 allow /start_job/ with 0 second duration
Apparently it is possible to get this for very short jobs.
2025-03-04 14:09:04 +01:00
Christoph Kluge
e733688fd0 add new subCluster prop to statsTable metric select 2025-03-03 17:54:34 +01:00
Christoph Kluge
e86f6a8cbd Merge pull request #351 from ClusterCockpit/dev
Dev
2025-03-03 17:34:33 +01:00
Christoph Kluge
fcc9e17664 change: remove metrics from job view select if unavailable on subCluster 2025-03-03 17:24:54 +01:00
Christoph Kluge
5c9d4ffa9a clarify and simplify earlier change 2025-03-03 17:00:33 +01:00
Christoph Kluge
419bc2747b fix nodeInfo null error 2025-03-03 16:53:19 +01:00
Christoph Kluge
1ee99d6866 Merge pull request #348 from ClusterCockpit/dev
use extendedLegend in nodeList for all non-idle nodes
2025-03-03 12:48:26 +01:00
Christoph Kluge
3ab8973895 use extendedLegend in nodeList for all non-idle nodes
- changed from "use for shared nodes only"
2025-03-03 12:44:18 +01:00
Jan Eitzinger
acfa3baeb5 Merge pull request #347 from ClusterCockpit/dev
Dev
2025-03-03 11:34:54 +01:00
Christoph Kluge
c21d7cf101 fix and review quick starttime select handling 2025-03-03 11:21:54 +01:00
Christoph Kluge
ec895e1d9e Add fallback case to nodeInfo 2025-03-03 09:36:37 +01:00
Christoph Kluge
c964f09a4f Merge branch 'dev' into review_logging 2025-02-28 17:19:00 +01:00
Christoph Kluge
0bc32f27df Merge branch 'dev' into migrate_svelte5 2025-02-28 17:18:30 +01:00
Christoph Kluge
6640e93ce9 edit new features for 1.4.3 releasenotes 2025-02-28 15:12:42 +01:00
Christoph Kluge
d7aefe0cf0 move user names in top lists to tooltip 2025-02-28 14:55:32 +01:00
Christoph Kluge
187fe5b361 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-02-28 14:40:29 +01:00
Christoph Kluge
b31aea7bc5 revert back to using globalMetrics in jobView metric default select 2025-02-28 14:40:27 +01:00
c661baf058 Load new default metrics config from working directory 2025-02-28 14:36:19 +01:00
Christoph Kluge
0fe0461340 remove conflicting variable layer in metric histo select 2025-02-28 14:00:27 +01:00
Christoph Kluge
d5394c9e92 fix: analysis view top links fixed, add full name to topusers 2025-02-28 13:37:59 +01:00
Christoph Kluge
42135fd26c if disableClusterSelection is set, display info in cluster filter
- instead of undocumented unresponsive cluster name select
2025-02-28 13:37:28 +01:00
Christoph Kluge
38569f55c7 add title to roofline plot
- Clarify that roofline is CPU only
2025-02-28 13:09:04 +01:00
Christoph Kluge
5ce03c2db3 add metric selection count info to job view 2025-02-28 13:08:32 +01:00
Christoph Kluge
1031b3eb79 fix: user and status view histogram selection
- correctly loads selection for selected cluster
- applies availability for selected cluster
2025-02-28 13:06:40 +01:00
Jan Eitzinger
fcdf4cd476 Bugfixes
Dev
2025-02-28 10:15:16 +01:00
6268dffff8 Readd time pkg after fixing merge conflict 2025-02-28 09:20:05 +01:00
c10737bfd7 Merge branch 'master' into dev 2025-02-28 09:17:39 +01:00
Christoph Kluge
bd0cc69668 Review fatalf log calls and messages 2025-02-27 18:10:04 +01:00
Christoph Kluge
84fffac264 Merge branch 'dev' into review_logging 2025-02-27 15:20:46 +01:00
Christoph Kluge
5bf968010e Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-02-27 14:51:34 +01:00
Christoph Kluge
61bc095d01 fix: decouple polarPlot data query, add new dedicated gql endpoint
- includes go package upgrades
- includes gqlgen error workaround
2025-02-27 14:51:31 +01:00
Michael Panzlaff
e376f97547 make swagger 2025-02-27 14:42:18 +01:00
Michael Panzlaff
f2428d3cb3 /jobs/stop_job/ change bad job_state to HTTP 422 2025-02-27 14:20:18 +01:00
Michael Panzlaff
2fdac85d31 fix: Do not allow to start a job with a state != running 2025-02-27 14:04:54 +01:00
Christoph Kluge
b731395689 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-02-26 15:21:00 +01:00
Christoph Kluge
07405e3466 fix: add missing exclusive filter handler for jobQuery 2025-02-26 15:20:58 +01:00
Christoph Kluge
fc0c76bd77 Apply new log function to init and main, review or add logtexts 2025-02-26 15:20:25 +01:00
Christoph Kluge
d209547968 Remove dedicated fatal loglevel, change to Fprintln for unformatted 2025-02-26 14:40:54 +01:00
632b9fc5ea Prepare Bugfix release 1.4.3 2025-02-26 12:54:50 +01:00
702591b4ec Merge branch 'master' into hotfix 2025-02-26 11:42:01 +01:00
Jan Eitzinger
c562746e5f Merge pull request #343 from ClusterCockpit/dev
Changes for Release v1.4.3
2025-02-25 13:09:54 +01:00
Jan Eitzinger
c0443cbec2 Merge pull request #334 from ClusterCockpit/add_statsRounding_dataLoader
Add stats rounding data loader
2025-02-25 13:07:03 +01:00
Christoph Kluge
0191bc3821 Annotate and review log functions, add stdout writers 2025-02-25 10:21:48 +01:00
Christoph Kluge
633bd42036 Add rounding to archiver avg stat calculation 2025-02-24 11:10:45 +01:00
Christoph Kluge
998ef8d834 fix: use job_view_selectedMetrics config instead of iterating globalMetrics
- Caveat: Minimal Defaultset needs to be generally available on all clusters
2025-02-19 16:40:25 +01:00
Christoph Kluge
c25b076ca9 fix: separate polar plot metric list from job.footprint return 2025-02-19 16:38:35 +01:00
Christoph Kluge
f43379f365 fix: add missing factor for job power calculation, see #340 2025-02-18 18:10:39 +01:00
Christoph Kluge
d902c0acf4 Merge branch 'master' into hotfix 2025-02-18 09:35:02 +01:00
Christoph Kluge
58e678d72c fix: load jobView roofline on finest resolution separately by default, see #339 2025-02-17 18:24:28 +01:00
Jan Eitzinger
cbc49669d0 Merge pull request #333 from brinkcoder/default-metrics-config
Add config for default of job_view_selectedMetrics:<cluster_name>
2025-02-17 11:23:02 +01:00
Jan Eitzinger
78bb638fd6 Merge pull request #338 from brinkcoder/add-login-ratelimiting
Add Rate Limiting for Login
2025-02-17 11:03:18 +01:00
exterr2f
7a61bae471 clarify error message for blocked user 2025-02-17 09:17:27 +01:00
exterr2f
e1b992526e Improve rate limiting to combination of IP and username 2025-02-14 20:20:42 +01:00
Christoph Kluge
1b043838ea fix: fix svelte js race condition on metric selection change, see #335
- only dispatch new data to statsTable on 'load-all'
2025-02-14 16:18:58 +01:00
Christoph Kluge
07e72294dc Merge branch 'master' into hotfix 2025-02-14 14:29:33 +01:00
exterr2f
b6b37ee68b Add Rate Limiting based on IP and username 2025-02-14 12:41:28 +01:00
exterr2f
43cb1f1bff Fix SessionMaxAge condition to correctly apply valid values 2025-02-14 11:44:46 +01:00
Michael Panzlaff
f7a67c72bf fix 'unhandled case' error for core metrics 2025-02-13 17:34:45 +01:00
Christoph Kluge
c5476d08fa amend polar frontend logs 2025-02-12 10:11:54 +01:00
Christoph Kluge
8af92b1557 simplify polar plot data code, add scaling for shared jobs to polar 2025-02-11 18:38:48 +01:00
Christoph Kluge
eaa826bb8a adds centralized rounding of series stats in dataLoader
- Fixed to two digit precision
2025-02-11 10:57:04 +01:00
Christoph Kluge
140b3c371d fix undefined if system_view_selectedMetric missing
- defaults to first metric on init
- reorder gitignore
2025-02-10 15:36:14 +01:00
exterr2f
f158eaa29c Add default_metrics.json which sets the defaults for job_view_selectedMetrics:cluster for new users 2025-02-10 09:39:49 +01:00
Christoph Kluge
c4b98ade53 increase user table height, add but disable autocomplete attribute
- missing autocomplete attribute was logged as warning in chrome console
2025-02-05 15:18:42 +01:00
Christoph Kluge
f2e85306ca fix wrong label ids in options view
- allowed setting wrong field
2025-02-05 12:58:51 +01:00
Christoph Kluge
42b9de8360 add canvasId default, fix analysis view pie props 2025-02-05 12:51:06 +01:00
Christoph Kluge
6c244f3121 render nodelist spinner info only for continuous scroll 2025-02-04 18:41:10 +01:00
Christoph Kluge
9f56213d2f fix list view sorting of string fields 2025-02-04 17:52:11 +01:00
Christoph Kluge
fb2f7cf680 fix dirty vars on textfilter reset 2025-02-04 13:29:09 +01:00
Christoph Kluge
8fcdd24f84 Second onclick pass 2025-02-04 12:52:56 +01:00
Christoph Kluge
aaafde4a7c add function syntax to sveltestrap onclick events and others
- fixes event_handler_invalid svelte warning and blockage
2025-02-04 12:13:06 +01:00
Christoph Kluge
2b23003556 fix metric selection drag and drop 2025-02-03 19:36:28 +01:00
Christoph Kluge
5681062f01 Initial migration to Svelte5 via full syntax compatibility
- updated all dependencies
- removed svelte-chartjs wrapper from dependencies
- sveltestrap causes compilation warnings (once)
- Header.svelte uses new Svelte5 syntax as example
- fixed most initial compilation warnings except circular dependencies with TBD cause
2025-02-03 17:31:01 +01:00
Michael Panzlaff
d61bf212f5 Fix 'make -B', don't fail if $(VAR) already exists 2025-02-03 17:02:13 +01:00
Michael Panzlaff
2bd7c8d51e Fix 'make -B'
Do not raise an error, if the directory already exists.
2025-02-03 16:52:50 +01:00
Christoph Kluge
1e63cdbcda fix: remove caching for footprint db field
- footprints before first worker run are cached as empty, and are permanently returned as such until app restart
- fixes Polar plot for running jobs #328
2025-01-30 17:14:17 +01:00
Jan Eitzinger
86d85f12be Merge pull request #330 from ClusterCockpit/dev
Fix crash if no data on metric-store side
2025-01-30 12:30:25 +01:00
Christoph Kluge
dd470d49ec fix potential crash cause due to index oor on empty ccms return 2025-01-30 11:55:40 +01:00
Christoph Kluge
95d8062b00 fix Generate JWT as user doesn't work #327 2025-01-30 11:10:50 +01:00
Christoph Kluge
8f82399214 Merge branch 'master' into hotfix 2025-01-30 10:36:33 +01:00
Jan Eitzinger
6247150e9c Merge pull request #326 from ClusterCockpit/dev
Dev
2025-01-28 14:23:15 +01:00
5266644725 Merge branch 'master' into dev 2025-01-28 14:21:16 +01:00
81d9e96552 Upgrade golang version 2025-01-28 14:17:28 +01:00
Jan Eitzinger
4ec9f06114 Merge pull request #325 from ClusterCockpit/add_detailed_nodelist
Add detailed nodelist
2025-01-28 13:53:57 +01:00
0033e9f6c0 Regenerate GraphQL adapter 2025-01-28 13:51:12 +01:00
571652c314 Merge branch 'dev' into add_detailed_nodelist 2025-01-28 13:47:22 +01:00
Jan Eitzinger
7ec233e18a Merge pull request #324 from ClusterCockpit/317_add_colorblindmode
add colorblind setting
2025-01-28 13:38:40 +01:00
Jan Eitzinger
13c9a12336 Merge pull request #323 from ClusterCockpit/add_histogram_bin_select
Add histogram bin select
2025-01-28 13:38:15 +01:00
Christoph Kluge
83d472ecd6 Fix duration histograms in status and analysis view, use defaults 2025-01-27 12:35:42 +01:00
Christoph Kluge
c21da6512a fix rerender by keys, disable resolution select if no resampling active 2025-01-24 16:17:51 +01:00
Christoph Kluge
4b4374e0df Merge pull request #322 from ClusterCockpit/hotfix
Update Dev Branch with Hotfix
2025-01-24 14:35:28 +01:00
Christoph Kluge
407276a04d Merge pull request #321 from ClusterCockpit/fix-go-version
fix: Fix go version in go.mod
2025-01-24 14:33:32 +01:00
Christoph Kluge
64f60905b4 Drop change on zoom for selector with options
- Up to 7 days worth of runtime
- No zoomState issue and cached results
2025-01-24 14:06:43 +01:00
Christoph Kluge
9e6072fed2 Add 25th hour for runtime bracket 24-25h
- zoomstate does not work
- maxbins too hardcoded for runtimes > 25 hours
2025-01-24 10:39:33 +01:00
Christoph Kluge
a3e5c424fd add zoom in metric histograms for running and completed states
- keeping last zoomstate does not work
2025-01-23 17:48:45 +01:00
Christoph Kluge
6683a350aa initial duration histogram zoom in frontend
- metric zoom todo
- keeping last zoomState does not work
2025-01-23 12:23:29 +01:00
Christoph Kluge
05bfa9b546 Prepare adaptive binCounts in backend 2025-01-22 12:07:12 +01:00
Christoph Kluge
735988decb add extended legend for nodelist acc metrics, move nodelist paging select 2025-01-21 18:35:03 +01:00
Christoph Kluge
d0580592be include feedback on nodeListView
- display names of users and projects
- stacked metricPlot for statsSeries
2025-01-17 13:13:00 +01:00
Christoph Kluge
817076bdbf initial prototyping 2025-01-16 12:25:49 +01:00
Christoph Kluge
736236e9ca add colorblind setting and friendly palettes
- mode applies to plot data, plot background color, statsseries colors, roofline timescale
2025-01-14 17:40:25 +01:00
Michael Panzlaff
3f4114c51b fix: Fix go version in go.mod
If the local go version is not up to date, go was previously unable to
obtain a more recent version, since the required version in go.mod is
not available.
2025-01-14 10:34:16 +01:00
Christoph Kluge
5c2c493c56 edit page titles, fix nodeList continuous parameter changes 2025-01-13 14:00:10 +01:00
Christoph Kluge
2c383ebea1 add independent config keys for nodeList 2025-01-13 11:46:17 +01:00
Christoph Kluge
91e73450cf fix error on metric selection change if continuous 2025-01-13 11:13:06 +01:00
Christoph Kluge
e55798944e add navbar select, add continuous scroll, paging persistence 2025-01-10 18:02:54 +01:00
Christoph Kluge
5ea11a5ad2 fix legends, add resolution, add statsseries, add simple healthcheck 2025-01-10 16:06:29 +01:00
Christoph Kluge
2a3383e9e6 add scopes, paging and backend filtering to nodeList 2025-01-09 18:56:50 +01:00
Christoph Kluge
e871703724 add dedicated nodeListData handling to metricData interface 2025-01-09 11:18:04 +01:00
Christoph Kluge
1ee367d7be Merge branch 'hotfix' into add_detailed_nodelist 2025-01-07 14:07:41 +01:00
Christoph Kluge
bce536b9b4 fix: add missing parameters for correct shared metric thresholds 2024-12-20 17:15:02 +01:00
Christoph Kluge
7c9182e0b0 move shared routine to utils.js, remove flop peak reduction 2024-12-20 17:14:12 +01:00
Christoph Kluge
aa915d639d feat: add deselect all button to jobStatefilter 2024-12-20 13:02:21 +01:00
Jan Eitzinger
9489ebc7d6 Merge pull request #320 from ClusterCockpit/hotfix
Fixes for Bugfix Release 1.4.2
2024-12-19 14:51:07 +01:00
2a5c525193 Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2024-12-19 11:12:50 +01:00
9e2d981c60 Add notice about footprint to ReleaseNotes 2024-12-19 11:12:40 +01:00
Christoph Kluge
53dfe9e4f5 fix: footprint peak is default if footprint stat is avg 2024-12-19 11:00:12 +01:00
48e95fbdb0 Prepare release 1.4.2 2024-12-19 06:34:35 +01:00
fd94d85edf Compute duration for running jobs on the fly 2024-12-19 06:24:08 +01:00
f2d1a85afb Reformat json schema files 2024-12-19 06:14:35 +01:00
0bdbcb8bab Use persisted duration for running jobs
Fixes #318
2024-12-19 05:55:31 +01:00
Christoph Kluge
7b91a819be add workaround for clipboard button 2024-12-18 16:40:49 +01:00
bc89025924 Revert to blocking startJob REST api
Fixes #316
2024-12-18 11:45:56 +01:00
Jan Eitzinger
16bcaef4c3 Merge pull request #319 from ClusterCockpit/dependabot/go_modules/golang.org/x/crypto-0.31.0
Bump golang.org/x/crypto from 0.29.0 to 0.31.0
2024-12-18 07:27:19 +01:00
dependabot[bot]
fcbfa451f2 Bump golang.org/x/crypto from 0.29.0 to 0.31.0
Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.29.0 to 0.31.0.
- [Commits](https://github.com/golang/crypto/compare/v0.29.0...v0.31.0)

---
updated-dependencies:
- dependency-name: golang.org/x/crypto
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-12-18 06:22:10 +00:00
Christoph Kluge
559ce53ca4 Merge branch 'hotfix' of https://github.com/ClusterCockpit/cc-backend into hotfix 2024-12-17 15:14:27 +01:00
Christoph Kluge
ee2c5b58d7 fix: add missing sorting parameter to REST API call and test 2024-12-17 15:14:24 +01:00
Jan Eitzinger
d98d998106 Merge pull request #315 from ClusterCockpit/hotfix
Prepare Bugfix release 1.4.1
2024-12-10 16:54:17 +01:00
212c45e070 Prepare bug fix release 1.4.1 2024-12-10 16:45:05 +01:00
143fa9b6ed Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2024-12-10 16:36:22 +01:00
4849928288 Rename old column name for user
Fixes #314
2024-12-10 16:35:43 +01:00
Christoph Kluge
9248ee8868 fix: fix renamed column reference in searchbar workflow 2024-12-09 11:06:12 +01:00
Jan Eitzinger
1616d96732 Merge pull request #312 from ClusterCockpit/hotfix
Remove obsolete archive migration from build list
2024-12-05 10:43:38 +01:00
0bbedd1600 Remove obsolete archive migration from build list 2024-12-05 10:41:54 +01:00
Jan Eitzinger
c7e49644d8 Merge pull request #311 from ClusterCockpit/hotfix
Hotfix
2024-12-05 08:41:04 +01:00
010c903c74 Add known issues section to release notes 2024-12-05 08:35:10 +01:00
e4d12e3537 Merge branch 'master' into hotfix 2024-12-05 07:50:48 +01:00
051cc8384e Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2024-12-05 07:50:38 +01:00
49a94170d2 Add Fixme note for Energy calculation 2024-12-05 07:49:52 +01:00
Jan Eitzinger
42e8e37bd4 Merge pull request #309 from ClusterCockpit/devel
fix: Update to resampler handling different resolutions
2024-12-04 18:37:02 +01:00
Jan Eitzinger
5d2c350ce2 Merge pull request #310 from ClusterCockpit/hotfix
Hotfix
2024-12-04 18:12:19 +01:00
Aditya Ujeniya
85dc0362c1 fix: SimpleResampler fixed 2024-12-04 17:54:54 +01:00
Christoph Kluge
01c06728eb review footprint iconography and messages 2024-12-04 16:09:06 +01:00
Christoph Kluge
257250714d review polar plot component, adds min dataset 2024-12-04 15:22:19 +01:00
Aditya Ujeniya
3b769c3059 fix: Update to resampler handling different resolutions 2024-12-04 14:19:56 +01:00
Christoph Kluge
a7395ed45b remove config for polarPlotMetrics 2024-12-04 13:57:05 +01:00
Christoph Kluge
ab07c7928f fix: fix footprint logic, do not scale thresholds on multi node jobs 2024-12-04 13:56:00 +01:00
Christoph Kluge
b0c0d15505 fix stat filter url write 2024-12-04 10:55:29 +01:00
Jan Eitzinger
fcf50790da Merge pull request #307 from ClusterCockpit/hotfix
Prepare release v1.4.0
2024-12-04 06:39:35 +01:00
Christoph Kluge
1e43654607 Merge pull request #308 from ClusterCockpit/dev
Move to open Release PR
2024-12-03 17:19:09 +01:00
Christoph Kluge
4fecbe820d change order to match docs 2024-12-03 17:11:32 +01:00
Christoph Kluge
763c9dfa6b fix schema definition of apiAllowedIPs 2024-12-03 15:22:34 +01:00
9de5879786 Prepare release v1.4.0 2024-12-03 09:01:21 +01:00
Jan Eitzinger
9396e7492c Merge pull request #306 from ClusterCockpit/dev
fix: fix job list render for continuous mode on filter or sort changes
2024-12-03 07:47:46 +01:00
3ac3415178 Mark new ui options as required 2024-12-03 07:41:23 +01:00
1aae1c59d0 Make continuous scroll the default 2024-12-03 07:27:10 +01:00
907e80a01c Update config json schema
Fixes #256
2024-12-03 07:26:36 +01:00
Christoph Kluge
8a10b69716 review findThresholds logic in metricPlot 2024-12-02 17:27:41 +01:00
Christoph Kluge
1a3cf7edd6 fix wrong var insert 2024-12-02 17:02:04 +01:00
Christoph Kluge
76d0fc979b fix: fix job list render for continuous mode on filter or sort changes 2024-12-02 12:49:43 +01:00
Jan Eitzinger
a42d8ece35 Merge pull request #305 from ClusterCockpit/dev
Fix Perl Script to generate subCluster Config
2024-11-30 06:43:46 +01:00
Christoph Kluge
93377f53fc add lastThreshold to jobListRow 2024-11-29 14:15:15 +01:00
Christoph Kluge
c853d74ba0 Update frontend dependencies 2024-11-29 12:57:34 +01:00
Christoph Kluge
0b9f74f4f4 fix: fix plot render for summed metrics on scope change 2024-11-29 12:56:53 +01:00
Christoph Kluge
5da6baf828 fix: prevent jump to table head on continuous scroll load 2024-11-29 12:00:28 +01:00
5766945006 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2024-11-28 17:12:21 +01:00
a53d473b58 Update subcluster-generate Perl Script
Fixes #278
2024-11-28 17:12:05 +01:00
Jan Eitzinger
d1207ad80e Merge pull request #304 from ClusterCockpit/dev
Dev
2024-11-28 15:23:02 +01:00
Christoph Kluge
e2efe71b33 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2024-11-28 15:18:14 +01:00
Christoph Kluge
2aef6ed9c0 fix: oversight error on redirect target 2024-11-28 15:18:07 +01:00
Jan Eitzinger
fcb6db0603 Merge pull request #303 from ClusterCockpit/dev
Fix Cookie settings, upgrade packages Fixes #301
2024-11-28 15:02:05 +01:00
01b1136316 Fix Cookie settings, upgrade packages 2024-11-28 14:58:33 +01:00
Jan Eitzinger
2512fe9e75 Merge pull request #302 from ClusterCockpit/dev
fix: solve inconsistencies with filters, fixes #280
2024-11-28 09:29:40 +01:00
Christoph Kluge
f89b5cd2ec fix: solve inconsistencies with filters, fixes #280 2024-11-27 18:43:56 +01:00
Jan Eitzinger
ab284ed208 Merge pull request #299 from ClusterCockpit/dev
feat: Add buffered channel with worker thread for job start API
2024-11-27 14:45:26 +01:00
Christoph Kluge
00a578657c feat: add edit of notice box content to admin settings 2024-11-27 10:50:11 +01:00
Christoph Kluge
38ce40ae7d feat: redirect to requested page after login, solves #281 2024-11-26 16:21:16 +01:00
e1be6c7138 Remove UpdateEnergy from UpdateFootprint Task
Computing total energy for running jobs does not make any sense
2024-11-26 10:49:44 +01:00
28539e60b0 Regenerate Swagger, fix tests, cleanup 2024-11-26 07:02:53 +01:00
adb11b3ed0 Re-enable Footprint worker 2024-11-25 17:35:22 +01:00
Jan Eitzinger
f1e6dedd44 Merge pull request #300 from ClusterCockpit/improve_footprint_transactions
Improve footprint transactions
2024-11-25 17:08:46 +01:00
Christoph Kluge
8ea1454c06 improve transaction init error handling 2024-11-25 17:03:59 +01:00
81b8d578f2 feat: Add buffered channel with worker thread for job start API
Fixes #293
Refactoring on the way
2024-11-25 16:44:50 +01:00
Jan Eitzinger
16b11db39c Merge pull request #298 from ClusterCockpit/dev
Database migration porting and keyword cleanup
2024-11-24 08:24:09 +01:00
0d923cc920 Ignore generated test artefacts 2024-11-24 07:49:26 +01:00
c523e93564 Update to new db schema 2024-11-24 07:48:30 +01:00
d588798ea1 Update test sqlite db 2024-11-24 07:41:39 +01:00
a11f165f2a Cleanup 2024-11-24 07:09:31 +01:00
Christoph Kluge
d4f487d554 comment debug logging 2024-11-22 17:56:55 +01:00
Christoph Kluge
93d5a0e532 correct input for check 2024-11-22 16:59:18 +01:00
Christoph Kluge
00ddc462d2 expand check, change to zero init 2024-11-22 16:31:35 +01:00
Christoph Kluge
5f4a74f8ba add check on returned stats 2024-11-22 15:57:28 +01:00
Christoph Kluge
a8eff6fbd1 small logging changes 2024-11-22 15:08:53 +01:00
Christoph Kluge
baa7367ebe change array init to empty array 2024-11-22 13:39:59 +01:00
Christoph Kluge
69f8a34aac more logging 2024-11-22 13:36:26 +01:00
Christoph Kluge
21b3a67988 add timers, add else case for transaction add 2024-11-22 13:13:43 +01:00
Christoph Kluge
d89574ce73 Use repo.loadStats, move transaction init 2024-11-22 12:42:49 +01:00
ddeac6b9d9 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2024-11-21 15:54:58 +01:00
17906ec0eb Add down migrations for documentation 2024-11-21 15:54:46 +01:00
Christoph Kluge
311c088d3d removes debug logging 2024-11-21 15:47:09 +01:00
a2584d6083 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2024-11-21 15:03:55 +01:00
35bd7739c6 fix: Replace reserved keywords in database schemas
Port migration to mariadb
2024-11-21 15:02:30 +01:00
7f43c88a39 Add example config for mariadb backend 2024-11-21 14:54:04 +01:00
Christoph Kluge
fc1c54a141 fix: use left join to keep unmatched stats query result rows 2024-11-21 14:39:03 +01:00
Jan Eitzinger
2af111c584 Merge pull request #297 from ClusterCockpit/hotfix
Update README
2024-11-16 07:52:52 +01:00
c093cca8b1 Update README 2024-11-16 07:45:18 +01:00
Jan Eitzinger
2bb1b78ba4 Merge pull request #296 from ClusterCockpit/hotfix
Hotfix
2024-11-16 07:42:15 +01:00
3ab26172c4 Port tests to new job archive version 2024-11-16 07:03:29 +01:00
cdd45ce88b Fix importers and add Energy footprint to import 2024-11-16 06:36:55 +01:00
210a7d3136 Debugging initDB archive import
Footprint working
EnergyFootprint still missing
2024-11-14 19:13:07 +01:00
92ec64d80f Update demo config file 2024-11-14 19:10:55 +01:00
ff37f71fdb Increase job archive required version 2024-11-14 19:10:37 +01:00
6056341525 Remove obsolete Archive Migration Tool 2024-11-14 19:09:56 +01:00
Jan Eitzinger
075612f5bd Merge pull request #294 from ClusterCockpit/hotfix
Disable UpdateFootprint service for debugging
2024-11-12 06:37:44 +01:00
1a87ed8210 Disable UpdateFootprint service for debugging 2024-11-09 09:24:51 +01:00
Jan Eitzinger
c05ffeb16d Merge pull request #289 from ClusterCockpit/dev
Update Q4 2024
2024-11-09 09:05:25 +01:00
ee3710c5ed Merge branch 'master' into dev 2024-11-09 09:01:04 +01:00
4327c4b1f7 Start archive worker 2024-11-08 19:44:11 +01:00
492e56a098 Put privilege drop to previous location 2024-11-08 19:23:54 +01:00
f0257a2784 Drop privileges after server start 2024-11-08 19:16:56 +01:00
ec1ead89ab Switch back to previous meaning of energy metric attribute 2024-11-08 06:27:27 +01:00
Christoph Kluge
ae53e87aba Merge pull request #292 from ClusterCockpit/hotfix
Hotfix: add orderBy param to jobRepo.Find
2024-10-31 15:51:32 +01:00
Christoph Kluge
939dd2320a Cleanup debug logging, keep orderBy param for repo.Find 2024-10-31 15:47:45 +01:00
Christoph Kluge
2c8b73e2e2 add logged timing to homeroute calls 2024-10-31 14:34:32 +01:00
Christoph Kluge
eabc6212ea add debug logging for user context and web render 2024-10-31 13:36:27 +01:00
Christoph Kluge
c120d6517f change logging key, add args, add orderby id job.Find() 2024-10-30 16:24:58 +01:00
Christoph Kluge
597ee1dad7 change log to request and sql prints 2024-10-29 18:39:23 +01:00
Christoph Kluge
c4a901504d change debug format key 2024-10-29 18:25:41 +01:00
Christoph Kluge
f5cc5d07fd add more logging to rest api stopJobByRequest 2024-10-29 17:01:05 +01:00
Christoph Kluge
8a0e6c921c Merge pull request #291 from ClusterCockpit/hotfix
add logging to rest stopJobHandler
2024-10-29 08:21:33 +01:00
Christoph Kluge
bf1bff9ace fix tagManagement condition 2024-10-28 16:42:19 +01:00
Christoph Kluge
06f24e988f fix incorrect config conditions 2024-10-28 11:56:34 +01:00
Christoph Kluge
ae327f545e add logging to rest stopJobHandler 2024-10-25 15:23:49 +02:00
Christoph Kluge
35012b18c5 one more note 2024-10-25 14:47:09 +02:00
Christoph Kluge
9688bad622 note decision of implementation Q4 2024-10-24 18:14:31 +02:00
Christoph Kluge
447b8d3372 Merge branch 'dev' into add_detailed_nodelist 2024-10-23 17:09:33 +02:00
Christoph Kluge
01102cb9b0 feat: add updateUserOnLogin config option for oidc, jwt 2024-10-23 16:17:47 +02:00
Christoph Kluge
934d1a6114 fix: use configured footprint statType for update 2024-10-23 16:16:28 +02:00
Christoph Kluge
6f74c8cb77 feat: make cron worker frequency configurable 2024-10-23 16:15:44 +02:00
Christoph Kluge
63b9e619a4 fix: fixed and changed to footprint update by transactions 2024-10-22 14:37:22 +02:00
Christoph Kluge
82e28f26d7 feedback: add jobID copy btn to jobInfo 2024-10-21 15:45:27 +02:00
Christoph Kluge
ca9fd96baa update frontend dependencies and save them 2024-10-18 10:08:43 +02:00
Christoph Kluge
39b22267d6 Update component descriptions 2024-10-16 16:03:31 +02:00
Christoph Kluge
60d7984d66 add notes 2024-10-16 14:16:31 +02:00
Christoph Kluge
33d219d2ac Add subCluster to node view info field 2024-10-16 13:05:03 +02:00
Christoph Kluge
85a77e05af edit nodeInfo string 2024-10-16 12:51:10 +02:00
Christoph Kluge
3dfeabcec6 simplify plotGrid, add cancel to metricSelect, improve metricPlot render logic 2024-10-16 12:41:15 +02:00
Christoph Kluge
673fdc443c Finish prototype implementation of nodelist view 2024-10-14 18:37:48 +02:00
Christoph Kluge
2f6e5a7648 Move common logic into systems view again
- adds backend log if subcluster for node not configured
2024-10-14 11:55:59 +02:00
Christoph Kluge
2cbe8e9517 Split systems view into node-overview and node-list 2024-10-11 12:30:55 +02:00
Christoph Kluge
2f0460d6ec feat: make quick select starttimes url copyable 2024-10-10 18:35:53 +02:00
Christoph Kluge
37f4ed7770 add additional indices for sorting performance 2024-10-09 17:52:46 +02:00
Christoph Kluge
e3104c61cb filter taglist scope visibility by role, add global tag handling to support role 2024-10-09 13:23:06 +02:00
Christoph Kluge
bc434ee8cb add managed projects, update navbar layout, fix small issues 2024-10-09 11:08:14 +02:00
Christoph Kluge
f4102b948e rework clientwidth binds and size defaults for histograms 2024-10-08 18:46:59 +02:00
Christoph Kluge
ed991de11a fix: add missing resampleConfig handling to scope select 2024-10-08 17:54:12 +02:00
Christoph Kluge
322e161064 cleanup leftover 2024-10-08 17:36:28 +02:00
Christoph Kluge
1adc741cc2 remove dev logging 2024-10-08 17:32:51 +02:00
Christoph Kluge
4eff87bbf7 update frontend dependency manager version, adds license info 2024-10-08 17:31:47 +02:00
Christoph Kluge
fc6970d08a fix plotgrid display error, use plotheight default 2024-10-08 17:31:15 +02:00
Christoph Kluge
f616c7e1c6 remove width tags from slot defs 2024-10-08 15:26:09 +02:00
Christoph Kluge
89ec749172 experimental rework of metricplot render and resize handling 2024-10-08 15:25:31 +02:00
Christoph Kluge
182f0f2c64 fix: add missing default resolution case 2024-10-08 10:42:13 +02:00
Christoph Kluge
e3681495ce update frontend dependencies 2024-10-07 17:40:21 +02:00
Christoph Kluge
37415fa261 improve job list toolbar layouting, smaller layout fixes 2024-10-07 17:36:40 +02:00
Christoph Kluge
7243dbe763 replace plotTable with new bootstrap plotGrid component
- helps with narrow window sizes
- plotTable kept for now
2024-10-02 17:48:46 +02:00
Christoph Kluge
0ff5c4bedd Make global searchfield adaptive to screensize 2024-10-02 15:43:46 +02:00
Christoph Kluge
f047f89ad5 fix column count and add margins 2024-10-02 14:48:21 +02:00
Christoph Kluge
0eb0aa1d3b change default range to 12h, rework layout in system node views 2024-10-02 14:37:32 +02:00
Christoph Kluge
6019891591 add energy filter in new component 2024-10-01 16:25:09 +02:00
Christoph Kluge
615281601c fix wrong flag labelling, change to kWh energy calculation 2024-10-01 14:58:19 +02:00
Christoph Kluge
82baf5d384 fix deepCopy of statisticsSeries for archived jobs 2024-10-01 12:48:32 +02:00
Christoph Kluge
6fe93ecb7e fix adaptive legend title 2024-10-01 11:42:46 +02:00
Christoph Kluge
b3222f3523 fix: archived statisticsSeries with mean data now shown again 2024-09-30 18:31:49 +02:00
Christoph Kluge
3b94863521 add sorting for job energy column 2024-09-30 18:30:26 +02:00
Christoph Kluge
582dc8bf46 add energy column index 2024-09-30 18:29:46 +02:00
Christoph Kluge
a9868fd275 display energySummary only if energy data is present 2024-09-30 16:43:38 +02:00
Christoph Kluge
218e56576a round calculated updateFootprint values to two digits 2024-09-30 16:33:28 +02:00
Christoph Kluge
c50e79375a fix ccb side of unintentionally added endpoint format change in ccms 2024-09-30 15:27:49 +02:00
Christoph Kluge
dcb8308f35 add icons to energySummary component 2024-09-30 12:27:32 +02:00
Christoph Kluge
183b310696 add base constant to tooltip 2024-09-27 13:48:14 +02:00
Christoph Kluge
c7d0c86d52 add missing template changes 2024-09-27 13:46:19 +02:00
Christoph Kluge
48225662b1 feat: display energy usage in job view
- optional emission constant config line added
2024-09-27 13:45:44 +02:00
Christoph Kluge
f53fc088ec fix bugs in autoupdater query builder returns 2024-09-25 18:05:04 +02:00
Christoph Kluge
05517fcbcd use direct db execution for autoupdaters
- transactions need to be reinvestigated
2024-09-25 18:04:29 +02:00
Christoph Kluge
18af51b0a4 improve tag list template 2024-09-25 13:24:01 +02:00
Christoph Kluge
ede3da7a87 improve tag scope clarity 2024-09-25 12:23:21 +02:00
Christoph Kluge
8e3327ef6a Merge branch 'sample_resolution_select' into dev 2024-09-24 17:43:15 +02:00
Christoph Kluge
827f6daabc Merge branch '275_tag_scope_jobview_rework' into dev 2024-09-24 17:25:20 +02:00
Christoph Kluge
2567442321 Merge branch 'master' into dev 2024-09-24 17:22:14 +02:00
Christoph Kluge
9cf5478519 Merge pull request #288 from ClusterCockpit/hotfix
fix: fix crashing job view if roofline metrics missing
2024-09-24 14:47:51 +02:00
Christoph Kluge
e5275311c2 fix: fix crashing job view if roofline metrics missing 2024-09-24 14:37:39 +02:00
Christoph Kluge
21e4870e4c feat: add configurability to frontend plot zoom 2024-09-24 11:13:39 +02:00
Christoph Kluge
beba7c8d2e fix tag count bug if names non-unique, set global as default scope if none entered 2024-09-19 15:21:32 +02:00
Christoph Kluge
fe35313305 handle tag management based on role 2024-09-19 11:15:46 +02:00
Christoph Kluge
d7a8bbf40b Rework tag and tag edit placement, add other feedback
- admin message shown primarily if exists
- comment demo summary tab
2024-09-18 17:23:29 +02:00
Aditya Ujeniya
f1893c596e Versioning to query endpoint 2024-09-17 14:36:42 +02:00
Christoph Kluge
6367c1ab4d Merge branch 'dev' into 275_tag_scope_jobview_rework 2024-09-17 14:32:06 +02:00
Christoph Kluge
9579887fc4 Merge branch '275_add_tag_scope' into 275_tag_scope_jobview_rework 2024-09-16 15:04:01 +02:00
Christoph Kluge
e29be2f140 fix missing scope field request for jobview 2024-09-16 15:03:38 +02:00
Christoph Kluge
2736b5d1ef change background color for tag listitems 2024-09-16 15:00:42 +02:00
Christoph Kluge
ff52fb16b6 Merge branch '275_add_tag_scope' into 275_tag_scope_jobview_rework 2024-09-16 13:55:17 +02:00
Christoph Kluge
ccbf3867e1 change global tag color from gray to magenta 2024-09-16 13:54:40 +02:00
Christoph Kluge
f0de422c6e rework tagManagement modal render 2024-09-11 11:28:11 +02:00
Christoph Kluge
64cc19b252 remove icon from metric select, change color 2024-09-10 16:53:34 +02:00
Christoph Kluge
26226009f0 Merge branch 'rework_jobview_header' into 275_tag_scope_jobview_rework 2024-09-10 16:44:56 +02:00
Christoph Kluge
d10e09da02 button width to tablecolumn width 2024-09-10 16:43:43 +02:00
Christoph Kluge
00a2e58fee Merge branch 'rework_jobview_header' into 275_tag_scope_jobview_rework 2024-09-10 12:35:21 +02:00
Christoph Kluge
b1cb45dfe6 add overflow-x to statsTable, use sveltestrap input 2024-09-10 12:14:34 +02:00
Christoph Kluge
a2951d1f05 Add message to tagManegement 2024-09-10 09:45:47 +02:00
Christoph Kluge
c0b1e97602 adds message if no tags attached to job 2024-09-10 09:23:01 +02:00
Christoph Kluge
71621a9dc4 Wrap plottable in job view 2024-09-09 19:01:07 +02:00
Christoph Kluge
b3ed2afebe feat: move tag management to new job view header 2024-09-09 18:06:13 +02:00
Christoph Kluge
704620baff Remove unnecessary bind 2024-09-09 11:41:14 +02:00
Christoph Kluge
8feb805167 Merge branch 'rework_jobview_header' into 275_tag_scope_jobview_rework 2024-09-09 11:39:52 +02:00
Christoph Kluge
065b32755a small size correction if footprint not shown 2024-09-09 11:09:21 +02:00
Christoph Kluge
1b5f4bff2c feat: SyncUserOnLogin now updates name of token logged user 2024-09-09 10:32:26 +02:00
Christoph Kluge
8e1c5a485f Improve grid scaling 2024-09-06 12:00:33 +02:00
5fa6c9db35 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2024-09-06 11:25:03 +02:00
5482b9be2c Add debug output 2024-09-06 11:24:54 +02:00
Christoph Kluge
7400273b0a Manual merge changes not staged last time ... 2024-09-05 17:27:18 +02:00
Christoph Kluge
0b7cdde4a0 Merge branch 'dev' into sample_resolution_select
- Moved resample changes to metricDataDispatcher
- Added res argument to archiver, updateFootprintService
2024-09-05 17:26:43 +02:00
Christoph Kluge
d5382aec4f Merge branch 'dev' into 275_add_tag_scope 2024-09-05 16:44:41 +02:00
Christoph Kluge
df484dc816 rework job view header, change footprint to summary component 2024-09-05 16:44:03 +02:00
Christoph Kluge
7ea4086807 Rework sqlite indices in v8 migration 2024-09-05 15:06:38 +02:00
Christoph Kluge
b04bf6a951 fix missing condition in migration 2024-09-05 15:00:43 +02:00
7c33dcf630 Bugfix in footprint update 2024-09-05 14:58:08 +02:00
5e65e21f0b Add quotes in duration query 2024-09-05 12:38:39 +02:00
53ca38ce53 Add debug output to duration query 2024-09-05 11:18:00 +02:00
Christoph Kluge
398e3c1b91 feat: split concurrent jobs list to own scrollable component 2024-09-04 10:23:23 +02:00
508978d586 Initial attempt to update footprints in transaction 2024-09-03 15:59:01 +02:00
e267481f71 Cleanup transaction api 2024-09-03 15:40:02 +02:00
Christoph Kluge
193bee5ac8 fix: prevent addition of existing scopes to table 2024-09-03 14:16:16 +02:00
f58efa2871 Allow to combine job update queries 2024-09-03 13:41:00 +02:00
6568b6d723 Prepare transaction API for general usage 2024-09-03 13:40:11 +02:00
Christoph Kluge
4b1b34d8a7 remove logging, remove forced change to node scope 2024-09-03 13:10:44 +02:00
39c09f8565 Introduce job duration update task 2024-09-03 10:03:38 +02:00
Christoph Kluge
275a77807e fix typo in migration 2024-09-03 09:40:00 +02:00
Christoph Kluge
6443541a79 fix SQL migration syntax 2024-09-03 09:34:45 +02:00
Christoph Kluge
5eb6f7d307 fix: user name join not required for normal jobStats 2024-09-02 18:45:33 +02:00
Christoph Kluge
bce2a66177 Merge branch 'change_resolution_on_zoom' into sample_resolution_select 2024-09-02 18:23:13 +02:00
Christoph Kluge
7602641909 feat: change to resolution increase on zoom 2024-09-02 18:22:34 +02:00
Christoph Kluge
54f3a261c5 Rewrite sqlite indices from scratch for v8 migration 2024-09-02 18:20:32 +02:00
Christoph Kluge
906bac965f feat: add dropdown to user and project list navbar 2024-09-02 17:55:12 +02:00
Christoph Kluge
4ec1de6900 fix constant gql query 2024-09-02 17:54:45 +02:00
Christoph Kluge
8ded131666 Change user list name lookup to join 2024-09-02 17:54:25 +02:00
47b14f932e Start footprint service 2024-09-02 12:07:44 +02:00
Aditya Ujeniya
838ebb3f69 Updates res 2024-09-01 22:54:43 +02:00
c459724114 Resolve build errors 2024-08-30 13:50:49 +02:00
b0c9d1164d Add initial version of footprint update service
Not tested yet
2024-08-30 07:22:40 +02:00
7c51d88501 Add stub for Footprint update service 2024-08-29 08:45:04 +02:00
5b03cf826b feat: Add total energy and energy footprint 2024-08-29 07:26:49 +02:00
f305863616 Bugs fixed in unit tests and archiver init 2024-08-28 12:26:35 +02:00
db5809d522 Move rest of archiveing code into new archive package 2024-08-28 11:13:54 +02:00
Jan Eitzinger
83df6f015c Merge pull request #287 from ClusterCockpit/refactor-archiving
Refactor archiving
2024-08-28 10:14:46 +02:00
e7231b0e13 Finish refactoring
Add new packages:
- metricDataDispatcher
- archiver
2024-08-28 10:03:04 +02:00
Christoph Kluge
cff60eb51c increase server timeout limit, improve and add db indices
- change energy footprint key to string
2024-08-27 17:43:48 +02:00
f914a312f5 Introduce metricDataDispatcher
Does not compile yet
2024-08-27 16:44:16 +02:00
56ebb301ca Start to restructure
Does not compile
2024-08-27 10:14:33 +02:00
Christoph Kluge
a59df12595 init basic proof of concept 2024-08-26 17:37:23 +02:00
Christoph Kluge
5cc7fc6ccb Merge branch 'sample_resolution_select' of https://github.com/ClusterCockpit/cc-backend into sample_resolution_select 2024-08-26 09:55:36 +02:00
Christoph Kluge
55027cb630 fix: add resolution 60 default to ccms nodeData query 2024-08-26 09:55:33 +02:00
Aditya Ujeniya
036eba68e1 Fix for resampler 2024-08-25 16:13:43 +02:00
Christoph Kluge
d34e0d9348 fix: omit resources prop from metricPlot, use series for legend instead 2024-08-23 16:59:45 +02:00
Christoph Kluge
31765ce0ef Merge branch 'dev' into 275_add_tag_scope 2024-08-23 14:52:42 +02:00
Christoph Kluge
9fe7cdca92 fix: fix plot labeling if specific host selected, hide loadall if only node returned 2024-08-23 13:53:15 +02:00
Christoph Kluge
adc3502b6b cleanup dev logline 2024-08-23 13:37:42 +02:00
Christoph Kluge
95fe369648 fix: add additionally loaded scopes to statsTable again 2024-08-23 13:26:56 +02:00
Christoph Kluge
01845a0cb7 add comment regarding metric data load 2024-08-22 18:33:18 +02:00
Christoph Kluge
708eaf4178 fix dev leftovers 2024-08-22 17:55:21 +02:00
Christoph Kluge
d629a58712 Merge branch 'dev' into sample_resolution_select 2024-08-22 17:33:16 +02:00
Christoph Kluge
90886b63d6 Merge pull request #286 from ClusterCockpit/devel
Sampling Feature for archived and fresh data
2024-08-22 17:16:28 +02:00
Christoph Kluge
084f89fa32 fix: fix svelte source paths in makefile 2024-08-22 14:46:27 +02:00
Aditya Ujeniya
ceb3a095d8 Sampling Feature for archived and fresh data 2024-08-22 14:29:51 +02:00
Christoph Kluge
1758275f11 fix: fix getMetricConfigDeep util function
- threw error for mismatching metric availability between clusters
2024-08-22 14:01:27 +02:00
Christoph Kluge
e74e506ffe cleanup outdated code 2024-08-20 16:41:35 +02:00
Christoph Kluge
599a36466a fix new data reactivity for accelerators 2024-08-20 14:52:13 +02:00
Christoph Kluge
613e128cab cleanup dev logging 2024-08-20 11:51:38 +02:00
Christoph Kluge
e4f8022b7a change to one reactive metric data load on two variables 2024-08-20 11:39:19 +02:00
Jan Eitzinger
5603c41900 Merge pull request #284 from ClusterCockpit/Refactor-job-footprint
Refactor job footprint
2024-08-19 12:15:59 +02:00
a8a27c9b51 Add project index to job table 2024-08-19 12:11:53 +02:00
Christoph Kluge
b70de5a4be Handle single update data 2024-08-16 16:35:17 +02:00
Christoph Kluge
b1fd07cd30 add single update gql queries to metric wrapper 2024-08-16 14:50:31 +02:00
Christoph Kluge
6ab2e02fe6 Merge branch 'Refactor-job-footprint' into sample_resolution_select 2024-08-16 13:05:09 +02:00
Christoph Kluge
5535c5780c Merge branch 'Refactor-job-footprint' of https://github.com/ClusterCockpit/cc-backend into Refactor-job-footprint 2024-08-15 14:33:08 +02:00
Christoph Kluge
49e0a2c055 fix: add compatibility for footprint metrics without config 2024-08-15 14:33:04 +02:00
AmritanshuV
efbe53b6b4 Rules 2024-08-15 12:40:57 +02:00
5e074dad10 Resolve error in migration 2024-08-15 12:39:14 +02:00
d6a88896d0 Refactor: Reduce struct memory size 2024-08-15 12:36:21 +02:00
5c99f5f8bb Only add footprint columns if not 0 2024-08-15 12:35:11 +02:00
e1faba0ff2 Update cluster json schema 2024-08-15 10:39:32 +02:00
ba2f406bc0 Extend sqlite db migration 2024-08-15 09:41:54 +02:00
9b6db4684a Refactor: Remove redundant code 2024-08-15 08:53:49 +02:00
Christoph Kluge
561fd41d5d fix: add accelerator scope to to-be archived scopes
- if numAcc > 0
- fixes Add accelerator scope to archive requests #282
2024-08-13 17:49:28 +02:00
Christoph Kluge
ce9995dac7 fix: fix wrongly inserted gql request and import path error 2024-08-08 12:29:45 +02:00
Christoph Kluge
0afaea9513 initial commit with example event dispatch 2024-08-08 12:28:36 +02:00
Christoph Kluge
9b5c6e3164 fix StartJobTest, add tag_scope to migration 2024-08-05 10:37:42 +02:00
Christoph Kluge
e6ebec8c1e fix TestGetTags test, was missing scope and ctx 2024-08-05 10:19:00 +02:00
Christoph Kluge
2551921ed6 fix: wrong display of tag after filter select
- exitent pills were non-updated on change of key
2024-08-02 18:14:24 +02:00
Christoph Kluge
e02575aad7 adds comments 2024-08-02 16:42:55 +02:00
Christoph Kluge
ff3502c87a fix: fix tag filter results
- displayed multiple identical entries before
- job count was incorrect before
2024-08-02 16:11:47 +02:00
Christoph Kluge
017f9b2140 feat: Add tag scopes to front and backend, initial commit 2024-08-01 18:59:24 +02:00
Christoph Kluge
c80d3a6958 fix: errors in import paths 2024-08-01 16:11:23 +02:00
Christoph Kluge
3ca1127685 Restructure frontend svelte file src folder
- Goal: Dependency structure mirrored in file structure
2024-07-26 12:34:18 +02:00
Christoph Kluge
18369da5bc Fix small oversight. remove wip plot component 2024-07-26 10:46:13 +02:00
Christoph Kluge
e65100cdc8 Add vscode @component comment to every svelte file, remove unused js exports 2024-07-25 17:10:00 +02:00
Christoph Kluge
6a1cb51c2f Refactor svelte frontend
- Adapt to new metricConfig logic
- Footprint-Metrics generalized for bar card
- Footprint-Metrics in stats filter and sorting
- Frontend always uses GQL, except adminOptions
- Job View will load scopes for all metrics on request
2024-07-22 15:41:33 +02:00
c4d93e492b Remove bugs in main init 2024-07-20 10:03:14 +02:00
c2f72f72ac Update go dependencies 2024-07-20 08:59:51 +02:00
721b6b2afa Change footprint variabel from bool to string
The footprint variable also indicates the type of statistic used now
2024-07-20 08:59:07 +02:00
b6f011c669 Move footprint update task placeholder to taskmanager 2024-07-16 12:34:27 +02:00
801607fc16 Refactor main
Convert components to Singletons
Restructure main package
Reduce dependencies
2024-07-16 12:08:10 +02:00
01a4d33514 Refactor: Archive workers and Tasks
Work in progress
2024-07-14 11:18:38 +02:00
e348ec74fd Fix bugs in stats.go 2024-07-12 14:08:48 +02:00
0458675608 Convert histogram query to json keys 2024-07-12 13:42:12 +02:00
c61ffce0e9 Make job query on metric stats generic 2024-07-12 13:21:19 +02:00
68a97dc980 Add footprint to global metric list 2024-07-12 13:20:54 +02:00
a07d167390 Fix build error with updated prometheus client 2024-07-12 09:17:31 +02:00
Christoph Kluge
a8721dcc69 Regenerate gql after internal merge 2024-07-11 17:37:53 +02:00
Christoph Kluge
68cf952ac6 Merge branch 'Refactor-job-footprint' of https://github.com/ClusterCockpit/cc-backend into Refactor-job-footprint 2024-07-11 17:33:21 +02:00
Christoph Kluge
e14d6a81fe fix: fix db migration to v8, changes key name to cpu_load 2024-07-11 17:24:33 +02:00
Christoph Kluge
a4912893a8 Frontend refactor backend changes 2024-07-11 17:23:59 +02:00
0adfb631ef Update go version to 1.22 for Github test workflow 2024-07-11 17:11:01 +02:00
b64ce1f67f Add LowerIsBetter Metric boolean. Upgrade dependencies. 2024-07-11 16:58:12 +02:00
e8e3b1595d Switch to Go 1.22 to get rid of global loop variable bug 2024-07-11 16:12:20 +02:00
f1427d5272 Add global metric list including graphQL query 2024-07-11 11:09:14 +02:00
Christoph Kluge
bf6b87d65c Fix circular import after merge 2024-07-09 09:50:32 +02:00
Christoph Kluge
0240997257 Merge branch '263_use_median_for_statsseries' into Refactor-job-footprint 2024-07-09 09:28:21 +02:00
Christoph Kluge
f1e341f0b9 Initial commit for frontend refactor 2024-07-09 09:17:50 +02:00
a54acb8c42 Merge branch '264_user_api_access' into Refactor-job-footprint 2024-07-05 16:17:57 +02:00
c6ede67589 Add energy footprint 2024-07-05 16:16:01 +02:00
Christoph Kluge
11176da5d8 Merge branch 'Refactor-job-footprint' into 264_user_api_access 2024-07-05 16:11:42 +02:00
Christoph Kluge
0a604336c4 Fix other apitest subtests 2024-07-05 15:42:08 +02:00
Christoph Kluge
be9df7649f fix: setup user in api test config 2024-07-05 15:25:24 +02:00
Christoph Kluge
63fb923995 fix: fix api test router init 2024-07-05 13:16:21 +02:00
Christoph Kluge
3afe40083d rename api userconfig to frontend, return json on api auth error 2024-07-05 11:48:06 +02:00
Christoph Kluge
9d4767539c Restructure config frontend, add user jwt request 2024-07-04 17:30:16 +02:00
ac9bba8b5b Restructure and simplify job repo 2024-07-04 15:05:24 +02:00
80c46bea7f Fix bugs and failed testcases 2024-07-04 14:14:27 +02:00
Christoph Kluge
614f694777 fix typo in api url 2024-07-04 11:41:17 +02:00
Christoph Kluge
1072d7b449 Improve auth handling of rest apis used in frontend for compatibility 2024-07-04 11:16:45 +02:00
1b70596735 Fix and test subcluster Config 2024-07-04 06:49:59 +02:00
Christoph Kluge
61eebc9fbd Rework initial commit
- moved frontend configuration api to new subrouter for compatibility
2024-07-03 17:24:26 +02:00
b05909969f Add test for clusterConfig 2024-07-03 12:11:43 +02:00
bd89ce7cc9 Extend schema and start Unit test implementation
Does not compile and work yet
2024-07-02 10:13:11 +02:00
130613b717 Fix build errors
Code not yet functional
2024-06-28 17:08:28 +02:00
b3c1f39a0e Merge branch 'master' into Refactor-job-footprint 2024-06-28 16:50:04 +02:00
97c807cd33 Add migration for footprint 2024-06-28 16:49:24 +02:00
aede5f71ec Introduce adapted graphql schema 2024-06-28 16:49:02 +02:00
786770f56a Start to convert to new footprint layout 2024-06-28 16:48:10 +02:00
Jan Eitzinger
74d4f00784 Merge pull request #276 from ClusterCockpit/hotfix
Hotfix
2024-06-28 15:43:54 +02:00
d61c4235dc Merge branch 'master' into hotfix 2024-06-28 15:41:52 +02:00
e8794b8c79 Add graphql generation target to Makefile 2024-06-28 15:41:11 +02:00
552da005dc Add make target for swagger UI generator 2024-06-26 05:41:42 +02:00
Jan Eitzinger
51452d2e68 Merge pull request #272 from ClusterCockpit/hotfix
Export package runtimeEnv
2024-06-25 07:16:22 +02:00
5c5484b4d2 Export package runtimeEnv 2024-06-25 07:12:46 +02:00
Jan Eitzinger
9974a851e8 Merge pull request #271 from ClusterCockpit/hotfix
Prepare release 1.3.1
2024-06-22 08:59:35 +02:00
6c0bfc6c35 Prepare release 1.3.1 2024-06-22 08:55:37 +02:00
Christoph Kluge
41bbd203cc Merge pull request #270 from ClusterCockpit/hotfix
fix: make foorprint from statsSeries nullsafe
2024-06-21 09:38:16 +02:00
Christoph Kluge
4344c26bef fix: make foorprint from statsSeries nullsafe 2024-06-19 13:12:51 +02:00
Jan Eitzinger
e1c1c06fb2 Merge pull request #268 from ClusterCockpit/hotfix
Hotfix
2024-06-14 14:27:13 +02:00
Christoph Kluge
70e63764ff fix: allow single partial errors on otherwise non-empty returned metric array 2024-06-13 12:38:29 +02:00
Christoph Kluge
d10f3e3af6 add maxwidth to projects column 2024-05-27 15:00:00 +02:00
Christoph Kluge
a4397d5447 fix: add scramble to textfilter component 2024-05-27 12:09:55 +02:00
Christoph Kluge
320c87a1db fix: add additional 30d fitler to searchbar fallback handling 2024-05-27 11:11:25 +02:00
Christoph Kluge
8d1228c9e8 feat: rework list searchbar, adds project-specific mode, add to user-joblist 2024-05-23 15:43:09 +02:00
Christoph Kluge
420bec7c46 fix: fix jobname and arrayjobid timeouts by adding additional 30d filter
- improve archive worker logs
- add arrayjobid filter to url if used
2024-05-23 11:53:23 +02:00
Christoph Kluge
ba1658beac fix: correct selectable histogram placement in status view 2024-05-22 18:50:52 +02:00
Christoph Kluge
575753038b feat: add jobname filter to joblist textfilter
- allows combination of filters now including jobname
- rename component
2024-05-22 18:22:35 +02:00
Christoph Kluge
061c9f0979 fix: deselected metrics were marked as missing on new jobview load 2024-05-22 15:57:22 +02:00
Christoph Kluge
b48d1b8ad6 fix: correct status view columns on mobile displays 2024-05-22 14:21:54 +02:00
dff7aeefb8 Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2024-05-16 11:19:00 +02:00
54f7980162 fix: Add required key to init config file 2024-05-16 11:18:57 +02:00
Christoph Kluge
684cb5a376 feat: change statistics render of metric plot to min/max/median
- #263
2024-05-08 16:17:42 +02:00
Christoph Kluge
597bccc080 fix: add SQL JSON validity check to meta_data query 2024-05-06 13:15:15 +02:00
Christoph Kluge
72557fd0bf feat: add statistics series render to job view metric plots 2024-05-02 16:32:01 +02:00
Jan Eitzinger
0b2f2214f9 Merge pull request #259 from ClusterCockpit/hotfix
Hotfix: Improve hasNextPage and jobName Queries
2024-04-26 12:20:52 +02:00
Christoph Kluge
ef51e69ffb feat: Add roofline color scale for time information 2024-04-26 11:11:55 +02:00
Christoph Kluge
c9eb40f455 fix: fix metricPlot y zoom reset 2024-04-25 16:59:27 +02:00
Christoph Kluge
b66750339d add default value, remove unused argument 2024-04-25 16:59:04 +02:00
Christoph Kluge
136460567c Feat: Add by-user setting for paging type
- Solves Add User-Configuration for Infinite Scroll #262
2024-04-25 15:00:53 +02:00
Christoph Kluge
f80123c85d Fix: Add missing nullsafe for admin user table 2024-04-24 13:47:29 +02:00
Christoph Kluge
a22340196f Fix: Improve jobName query by parsing DB field as JSON
- No DB mirgration required
- SQLite internal EXTRACT function used
2024-04-22 12:14:40 +02:00
Christoph Kluge
cbaeffde2c fix: improve speed of hasNextPage query for infinite scroll 2024-04-22 11:29:31 +02:00
649d50812b Merge branch 'master' into 134-job-tagging 2024-04-22 11:03:13 +02:00
b67f5436f8 Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2024-04-21 15:04:09 +02:00
b637ddeb28 Refactor and reformat userConfig 2024-04-21 15:04:00 +02:00
Jan Eitzinger
a20b7eacd6 Merge pull request #258 from ClusterCockpit/hotfix
Hotfix
2024-04-15 12:58:52 +02:00
6df639a0c3 Prepare Release 1.3.0 2024-04-15 12:54:50 +02:00
Christoph Kluge
d4a9887532 Merge branch 'master' into hotfix 2024-04-15 10:39:00 +02:00
Christoph Kluge
79b08a181d fix: trigger continuous load condition earlier 2024-04-15 10:36:26 +02:00
Christoph Kluge
758cef1bd3 Merge pull request #257 from ClusterCockpit/hotfix
Hotfix
2024-04-12 15:43:25 +02:00
fb8bbea99d Remove year in copyright notice 2024-04-11 23:04:30 +02:00
9b261a4778 Merge branch 'master' into hotfix 2024-04-10 14:24:12 +02:00
Christoph Kluge
aafa29db8b fix: add acc scope to job query if acc >= 1 2024-04-03 14:15:04 +02:00
Jan Eitzinger
896c39f9bc Merge pull request #255 from ClusterCockpit/dependabot/go_modules/github.com/go-jose/go-jose/v3-3.0.3
Bump github.com/go-jose/go-jose/v3 from 3.0.1 to 3.0.3
2024-03-29 06:09:01 +01:00
dependabot[bot]
3a97ff7f57 Bump github.com/go-jose/go-jose/v3 from 3.0.1 to 3.0.3
Bumps [github.com/go-jose/go-jose/v3](https://github.com/go-jose/go-jose) from 3.0.1 to 3.0.3.
- [Release notes](https://github.com/go-jose/go-jose/releases)
- [Changelog](https://github.com/go-jose/go-jose/blob/v3.0.3/CHANGELOG.md)
- [Commits](https://github.com/go-jose/go-jose/compare/v3.0.1...v3.0.3)

---
updated-dependencies:
- dependency-name: github.com/go-jose/go-jose/v3
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-29 05:06:00 +00:00
Jan Eitzinger
7518c41fab Merge pull request #254 from ClusterCockpit/dependabot/go_modules/github.com/containerd/containerd-1.6.26
Bump github.com/containerd/containerd from 1.6.18 to 1.6.26
2024-03-29 06:04:58 +01:00
dependabot[bot]
8cb00a5340 Bump github.com/containerd/containerd from 1.6.18 to 1.6.26
Bumps [github.com/containerd/containerd](https://github.com/containerd/containerd) from 1.6.18 to 1.6.26.
- [Release notes](https://github.com/containerd/containerd/releases)
- [Changelog](https://github.com/containerd/containerd/blob/main/RELEASES.md)
- [Commits](https://github.com/containerd/containerd/compare/v1.6.18...v1.6.26)

---
updated-dependencies:
- dependency-name: github.com/containerd/containerd
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-28 16:45:31 +00:00
Jan Eitzinger
baa51db26c Merge pull request #252 from ClusterCockpit/20_infinite_scroll
20 infinite scroll
2024-03-28 17:43:02 +01:00
Christoph Kluge
fc260b2291 fix number of cols to prevent uplot overflow
-relates to Broken layout  in status view for roofline plot #225
2024-03-28 17:26:31 +01:00
Christoph Kluge
43ebb01b63 fix: move scroll event behind condition 2024-03-28 15:57:24 +01:00
Jan Eitzinger
067dc0df5d feat: Add OpenID Connect Authentication support
236 user authentication using keycloak or any openid client for using external auth providers such as ldap GitHub google
2024-03-28 14:36:58 +01:00
6828c97415 Add central function to persist users on Login 2024-03-28 14:22:23 +01:00
50401e0030 Fix conditional rendering of OIDC button in login 2024-03-28 13:18:25 +01:00
c3d2508693 Update package deps after merge 2024-03-28 12:09:08 +01:00
642fd5cc91 Merge branch 'master' into 236-user-authentication-using-keycloak-or-any-openid-client-for-using-external-auth-providers-such-as-ldap-github-google 2024-03-28 12:07:58 +01:00
e8fb5a0030 Add OpenID Connect authentication
Fixes #236
Template conditional not yet working
Needs more testing
2024-03-28 12:01:13 +01:00
Christoph Kluge
0dee5073c6 fix: make hasnextpage optional parameter, use only if inf scroll configured 2024-03-26 16:27:04 +01:00
Christoph Kluge
b9b452f043 feat: prototype infinite scroll implementation 2024-03-26 15:56:07 +01:00
Jan Eitzinger
ddd3fad1c6 Merge pull request #251 from ClusterCockpit/hotfix
Accelerator ID Display Bugs and Footprint
2024-03-25 10:35:48 +01:00
Christoph Kluge
1f5723a97e Merge branch 'hotfix' of https://github.com/ClusterCockpit/cc-backend into hotfix 2024-03-22 16:10:35 +01:00
Christoph Kluge
5a177c952d fix: multiple accs with identical label, cloned data for single acc
- GPU id label in job view statistic table is always the same on multi GPU jobs #239
- Multiple accelerators listed in plot despite using only one #241
2024-03-22 16:10:30 +01:00
Jan Eitzinger
86e456d152 Merge pull request #250 from ClusterCockpit/hotfix
Hotfix
2024-03-22 09:54:29 +01:00
03895f9e45 Swag dependency needs at least Go 1.20 2024-03-22 09:41:18 +01:00
5c79f44055 Clarify functionality of gen-keypair tool 2024-03-22 08:59:59 +01:00
83c38e74db Refactoring: Reduze bytesize of structs. 2024-03-22 08:59:35 +01:00
1e5f2944cf Upgrade dependencies. Port to jwt-auth v5. 2024-03-21 22:02:59 +01:00
Jan Eitzinger
e45ecbdef7 Merge pull request #249 from ClusterCockpit/hotfix
Hotfix
2024-03-19 16:21:47 +01:00
c65694b36c Add tags and metadata to job queries. Fix query parameter handling.
Fixes #248
2024-03-19 16:18:43 +01:00
0005469101 Refactor 2024-03-19 16:16:02 +01:00
Christoph Kluge
60b56bd41a Fix: Simplify footprint logic, fix aggregated sum values 2024-03-18 18:57:15 +01:00
Jan Eitzinger
81fe492655 Merge pull request #247 from ClusterCockpit/hotfix
Hotfix
2024-03-15 09:30:48 +01:00
Christoph Kluge
849b7e038d Fix: make footprint display configurable app-wide
- note: requires full ui-defaults object in config
2024-03-14 15:14:19 +01:00
Christoph Kluge
82f5257cf1 fix merge bugs 2024-03-14 14:24:54 +01:00
Christoph Kluge
e347659db4 moved module context script 2024-03-14 11:09:18 +01:00
Christoph Kluge
7940317857 Merge branch 'hotfix' of https://github.com/ClusterCockpit/cc-backend into hotfix 2024-03-14 11:08:37 +01:00
Christoph Kluge
58415ab5c3 Adapt for accs in shared threshold s 2024-03-14 10:35:14 +01:00
Jan Eitzinger
1176974a78 Merge pull request #245 from pc2/master
Disable foreign key check while updating database
2024-03-14 09:42:24 +01:00
Michael Schwarz
ce792426e6 Disable foreign key check while updating database 2024-03-14 09:15:15 +01:00
e92e727279 Extend oidc auth provider 2024-03-13 17:09:36 +01:00
f761900a3e Add initial code for oidc authentication support 2024-03-13 09:37:12 +01:00
32a57661fd Upgrade frontend dependencies
Change to most recent @sveltestrap/sveltestrap
Reformat with Svelte LSP
2024-03-09 10:30:40 +01:00
Jan Eitzinger
5004e44934 feat: Add jobs endpoint to retrieve job meta and all job metric data
203 make full jobarchive available per simple api call
2024-03-08 16:41:57 +01:00
99d55f05f8 feat: Add cluster config endpoint to rest api 2024-03-08 16:35:30 +01:00
9fd839fad8 Add rest endpoint to get all job data
Fixes #203
2024-03-08 15:31:34 +01:00
Jan Eitzinger
1c7cc9e16f fix: Adapt tag db queries to also work with mysql/mariadb
231 sql statement syntax
2024-03-08 11:32:30 +01:00
06d01962a6 feat: Allow to revert db to previous version 2024-03-08 11:28:26 +01:00
2c2c1accb5 Allow up and down migration of database 2024-03-08 10:58:45 +01:00
105b7eabf0 Add migration and introduce dirty flag handling 2024-03-08 10:47:32 +01:00
Jan Eitzinger
de1d83e1a9 Merge branch 'master' into 231-sql-statement-syntax 2024-03-08 09:05:13 +01:00
Jan Eitzinger
ce97780741 feat: Add rest endpoint to add/edit Metadata entry
Add rest endpoint edit_meta including helper routines
2024-03-08 09:01:39 +01:00
e81e56ea1d Add rest endpoint edit_meta including helper routines
Fixes #219
2024-03-08 08:51:05 +01:00
aa6336ea1e Refactor
Reformat.
Convert to query builder.
Add descriptive error log messages.
2024-03-06 14:50:08 +01:00
dd887cbb1f Do all tag queries with query builder
Fix bug in mysql database initialization.
Fixes #231
2024-03-06 13:47:52 +01:00
Jan Eitzinger
860053be67 Merge pull request #240 from ClusterCockpit/hotfix
fix: Return on error from metricstore
2024-02-23 10:17:45 +01:00
5a4671b7b1 Always return on metricstore error. 2024-02-23 10:08:41 +01:00
Christoph Kluge
ec581e3509 Adapt normal marker line for shared jobs 2024-02-16 16:57:02 +01:00
e7ae9dd06d Cleanup README 2024-02-13 11:55:00 +01:00
0c7f55ff8d Remove obsolete package build rule
In case of cc-backend it is recommended to setup by hand or use the prebuild binary.
2024-02-13 11:54:50 +01:00
bcd7f47409 Transfer docs to dedicated doc webpage 2024-02-13 11:54:02 +01:00
476caebe7f Merge branch 'master' into hotfix 2024-02-13 11:05:09 +01:00
Christoph Kluge
dadc81c026 Add title to logout btn 2024-02-13 10:06:52 +01:00
Christoph Kluge
cc719d0ae5 Add Docs to Navbartools, move logout btn 2024-02-13 09:59:38 +01:00
Christoph Kluge
53af79cf0d Match mem_used color logic for footprint with plots 2024-02-12 17:12:04 +01:00
Christoph Kluge
f534ad66e1 Merge pull request #238 from ClusterCockpit/hotfix
Hotfix
2024-02-09 17:26:33 +01:00
Christoph Kluge
2d8cf02296 Add nullsafe to footprint mean gather 2024-02-09 17:19:58 +01:00
Christoph Kluge
71386f8466 Fix footprint logic for metrics equal zero 2024-02-09 17:09:08 +01:00
Christoph Kluge
c897c8e56b Add missing rounding func 2024-02-09 17:06:46 +01:00
Christoph Kluge
2036069051 Remove unresponsive histogram selections 2024-02-09 16:49:56 +01:00
Christoph Kluge
be6c63e526 Merge pull request #237 from ClusterCockpit/hotfix
Fix pageload block due to missing nullsafe
2024-02-09 16:26:16 +01:00
Christoph Kluge
a2af9c152a Fix pageload block due to missing nullsafe 2024-02-09 16:21:11 +01:00
Jan Eitzinger
63f3dc926c Merge pull request #233 from ClusterCockpit/214_user_status_histograms
214 user status histograms
2024-02-08 12:28:13 +01:00
Christoph Kluge
21dde870c6 Merge branch 'master' into 214_user_status_histograms 2024-02-08 12:26:07 +01:00
Jan Eitzinger
04f37a85ce Merge pull request #232 from ClusterCockpit/196_add_footprint
196 add footprint
2024-02-08 12:07:42 +01:00
10a332083b Merge branch 'master' into 196_add_footprint 2024-02-08 12:03:00 +01:00
Christoph Kluge
6818d1de62 Resolve pullrequest comments 2024-02-07 13:26:13 +01:00
Jan Eitzinger
1b10b75e25 Merge pull request #230 from ClusterCockpit/hotfix
Intermediate Hotfix Merge
2024-01-18 14:54:02 +01:00
Christoph Kluge
b829a5aafe Improve binned data histogram legends 2023-12-13 11:58:14 +01:00
Christoph Kluge
07073e290a feat: add selectable histograms to status view 2023-12-12 16:46:03 +01:00
Christoph Kluge
ee6d286cd7 Small corrections 2023-12-12 15:42:14 +01:00
Christoph Kluge
119637cb9b Fix using crossjoin arguments not used 2023-12-12 15:07:23 +01:00
Christoph Kluge
ee4097a2dd Add missing filters to crossjoinquery 2023-12-11 13:55:56 +01:00
Christoph Kluge
1185737eaa Add metrics to histoselect, add userfilters
- edit struct to make only count return required
2023-12-08 12:03:04 +01:00
Christoph Kluge
7d14086e54 Rework histogramselection, fix reactivity 2023-12-06 12:58:03 +01:00
Christoph Kluge
78494cd30e fix selection, add zero default 2023-12-05 17:33:30 +01:00
Christoph Kluge
ead5c54bcb Prototype completed 2023-12-05 15:30:40 +01:00
Christoph Kluge
b5b355c16c Finished backend sql query and gql resolve 2023-12-05 11:59:01 +01:00
Christoph Kluge
3067d7b250 fix: Use peak threshold for render limit maxy 2023-12-01 14:18:37 +01:00
Christoph Kluge
9bc36152d9 intermediate save
- DOES NOT COMPILE
2023-12-01 13:22:01 +01:00
Christoph Kluge
c1b944b838 sec: update dependencies 2023-11-29 14:25:12 +01:00
Christoph Kluge
175a88f1c4 Merge branch 'hotfix' into 196_add_footprint 2023-11-29 10:42:56 +01:00
Christoph Kluge
aac3e7d2f4 fix: fix scope autoselect on jobview statstable 2023-11-29 10:42:32 +01:00
Christoph Kluge
c0488b8cbe Update comments roofline 2023-11-29 10:40:59 +01:00
Christoph Kluge
d66703c4d0 update pckagelock 2023-11-29 10:40:25 +01:00
Christoph Kluge
173975aadd Add footprint select to user jobList 2023-11-28 09:58:36 +01:00
Christoph Kluge
d97fa37d2c feat: add footprint card displaying basic metrics 2023-11-27 10:07:13 +01:00
Christoph Kluge
782262b52e add missing package date-fns 2023-11-27 10:04:36 +01:00
Christoph Kluge
b8213ef6be Remove logs, reduce code 2023-11-24 17:22:06 +01:00
Christoph Kluge
e34623b1ce Add db average stats to gql, use in footprint 2023-11-24 15:11:38 +01:00
Christoph Kluge
4e375ff32b Handle accelerated and shared jobs 2023-11-24 10:36:22 +01:00
Christoph Kluge
f7529be3ea Add threshold scaling based on used resources
- required for shared jobs
2023-11-23 12:15:35 +01:00
Christoph Kluge
1aa9720405 Switch from title to sveltestrap tooltip 2023-11-22 12:12:36 +01:00
Christoph Kluge
709880ff5a Use html tag for metadata message
- remove old footprint version based on chartjs pie
2023-11-22 10:53:18 +01:00
Christoph Kluge
6b78b4e12b Adds message display in jobView 2023-11-21 15:38:57 +01:00
Christoph Kluge
f342a65aba Adds persistance to showfootprint selection 2023-11-21 15:38:28 +01:00
Christoph Kluge
dc860f8fd9 Handle artifacts, fix single node footprint flops 2023-11-21 10:27:16 +01:00
Christoph Kluge
f8f900151a Fix width, spacing, render 2023-11-20 18:08:33 +01:00
Christoph Kluge
8d409eed0f Footprint in jobList as selectable 2023-11-20 17:53:12 +01:00
Christoph Kluge
dc86523cce Add alternative ver with progress bars 2023-11-20 14:16:01 +01:00
Christoph Kluge
506d112cce Wording changes 2023-11-17 16:47:07 +01:00
Christoph Kluge
eb7f92282d add log, force node scope 2023-11-17 11:34:17 +01:00
Christoph Kluge
3468e987b6 Reformat footprintData mapping 2023-11-17 11:18:30 +01:00
Christoph Kluge
5acd9ece7f Adds messages to footprint 2023-11-16 18:31:45 +01:00
Christoph Kluge
8bc43baf2c Fix units and labels 2023-11-16 16:45:29 +01:00
Christoph Kluge
a2c99fb56d Add colors based on thresholds 2023-11-16 15:07:17 +01:00
Christoph Kluge
9689f95ea1 Initial implementaion 2023-11-16 12:49:20 +01:00
Christoph Kluge
84d6b48353 Fix: default values and new option for time filter 2023-11-15 15:03:58 +01:00
Christoph Kluge
bf64fc5213 Add completed state indicator 2023-11-13 13:43:44 +01:00
Christoph Kluge
d9f9c8aaf5 fix: retrigger gql api at manual refresh
- solves #221
2023-11-03 17:09:16 +01:00
2502989ca2 Refactor 2023-09-28 10:20:35 +02:00
ba7cc9168e feat: add automatic application detection and tagging 2023-09-28 10:20:20 +02:00
dc0d9fe038 Add more tags to test db 2023-09-27 15:01:08 +02:00
0e6c6937cd Merge branch 'master' into 134-job-tagging 2023-09-27 05:30:36 +02:00
Jan Eitzinger
280b16c11c Merge pull request #218 from ClusterCockpit/hotfix
Prepare bugfix release
2023-09-15 16:02:19 +02:00
4b922c575e Prepare bugfix release 2023-09-15 15:59:54 +02:00
Jan Eitzinger
09528ed6b9 Merge pull request #217 from ClusterCockpit/hotfix
fix: adapt roofline render to browser zoomlevel
2023-09-15 12:34:47 +02:00
Christoph Kluge
e61ff01518 fix: adapt roofline render to browser zoomlevel
- make roofline linewidth configurable
2023-09-15 11:09:01 +02:00
Jan Eitzinger
a4c68bf7fe Merge pull request #215 from ClusterCockpit/hotfix
Hotfix
2023-09-08 12:17:49 +02:00
bb1c8cc25d fix: Move name extract from token in else branch 2023-09-08 12:11:49 +02:00
4b06fa788d fix: Fix buggy logic and simplify code if ValidateUser enabled 2023-09-08 11:50:28 +02:00
Jan Eitzinger
ab08600486 Merge pull request #213 from ClusterCockpit/hotfix
Hotfix
2023-09-07 16:39:01 +02:00
7a5ccff6da fix: Remove port before IP check 2023-09-07 16:36:47 +02:00
a407a5cf01 Add note on apiAllowedIPs to Release Notes 2023-09-07 15:27:46 +02:00
2b3e2f25ec fix: Add correct duration string for max-age option 2023-09-07 15:25:22 +02:00
ed5ecbd914 fix: Restructure swagger docs 2023-09-07 15:14:09 +02:00
2d4759114e Add Release Notes link to release page 2023-09-07 14:33:22 +02:00
c68b9fec42 fix: Add documentation for apiAllowedIPs option 2023-09-07 14:03:41 +02:00
Jan Eitzinger
0f34c8cac6 Merge pull request #212 from ClusterCockpit/moebiusband73-patch-1
Update README.md
2023-09-06 14:02:16 +02:00
Jan Eitzinger
d388a45630 Update README.md 2023-09-06 13:56:55 +02:00
d839c53642 Add initial structure 2023-08-22 10:56:32 +02:00
416 changed files with 66004 additions and 32831 deletions

15
.github/dependabot.yml vendored Normal file
View File

@@ -0,0 +1,15 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
- package-ecosystem: "gomod"
directory: "/"
schedule:
interval: "weekly"
- package-ecosystem: "npm"
directory: "/web/frontend"
schedule:
interval: "weekly"

View File

@@ -1,331 +0,0 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Release
# Run on tag push
on:
push:
tags:
- '**'
jobs:
#
# Build on AlmaLinux 8.5 using golang-1.18.2
#
AlmaLinux-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8.5
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which npm
dnf --assumeyes install 'dnf-command(builddep)'
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# AlmaLinux 8.5 is a derivative of Red Hat Enterprise Linux 8 (UBI8),
# so both created RPMs contain the substring 'el8' in their file names.
# This step replaces the substring 'el8' with 'alma85'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8.5 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el8/alma85/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el8/alma85}"
NEW_SRPM=${OLD_SRPM/el8/alma85}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "::set-output name=SRPM::${NEW_SRPM}"
echo "::set-output name=RPM::${NEW_RPM}"
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.SRPM }}
#
# Build on UBI 8 using golang-1.18.2
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
dnf --assumeyes --disableplugin=subscription-manager install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-focal-build:
runs-on: ubuntu-latest
container: ubuntu:20.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
apt --assume-yes install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu20.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Build on Ubuntu 22.04 using official go 1.19.1 package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu22.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8.5 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
- name: Download AlmaLinux 8.5 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
- name: Download UBI 8 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for UBI 8
- name: Download Ubuntu 20.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
# The gh-release action afterwards does not accept file lists but all
# files have to be listed at 'files'. The step creates one output per
# RPM package (2 per distro)
- name: Set RPM variables
id: files
run: |
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
echo "ALMA_85_RPM::${ALMA_85_RPM}"
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "U_2004_DEB::${U_2004_DEB}"
echo "U_2204_DEB::${U_2204_DEB}"
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
# See: https://github.com/softprops/action-gh-release
- name: Release
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
with:
name: cc-backend-${{github.ref_name}}
files: |
${{ steps.files.outputs.ALMA_85_RPM }}
${{ steps.files.outputs.ALMA_85_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.U_2004_DEB }}
${{ steps.files.outputs.U_2204_DEB }}

View File

@@ -7,7 +7,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: 1.19.x
go-version: 1.25.x
- name: Checkout code
uses: actions/checkout@v3
- name: Build, Vet & Test

29
.gitignore vendored
View File

@@ -1,19 +1,32 @@
/cc-backend
/var/job-archive
/var/*.db
/var/machine-state
/.env
/config.json
/uiConfig.json
/var/job-archive
/var/machine-state
/var/*.db-shm
/var/*.db-wal
/var/*.db
/var/*.txt
/var/checkpoints*
migrateTimestamps.pl
test_ccms_write_api*
/web/frontend/public/build
/web/frontend/node_modules
/.vscode/*
/archive-migration
/archive-manager
var/job.db-shm
var/job.db-wal
/internal/repository/testdata/job.db-shm
/internal/repository/testdata/job.db-wal
/.vscode/*
dist/
*.db
.idea
tools/archive-migration/archive-migration
tools/archive-manager/archive-manager

View File

@@ -34,19 +34,6 @@ builds:
main: ./tools/archive-manager
tags:
- static_build
- env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
goamd64:
- v3
id: "archive-migration"
binary: archive-migration
main: ./tools/archive-migration
tags:
- static_build
- env:
- CGO_ENABLED=0
goos:
@@ -70,7 +57,7 @@ archives:
{{- else }}{{ .Arch }}{{ end }}
{{- if .Arm }}v{{ .Arm }}{{ end }}
checksum:
name_template: 'checksums.txt'
name_template: "checksums.txt"
snapshot:
name_template: "{{ incpatch .Version }}-next"
changelog:
@@ -100,6 +87,7 @@ changelog:
release:
draft: false
footer: |
Supports job archive version 1 and database version 6.
Supports job archive version 2 and database version 8.
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
# vim: set ts=2 sw=2 tw=0 fo=cnqoj

26
AGENTS.md Normal file
View File

@@ -0,0 +1,26 @@
# ClusterCockpit Backend - Agent Guidelines
## Build/Test Commands
- Build: `make` or `go build ./cmd/cc-backend`
- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`)
- Run single test: `go test -run TestName ./path/to/package`
- Run single test file: `go test ./path/to/package -run TestName`
- Frontend build: `cd web/frontend && npm install && npm run build`
- Generate GraphQL: `make graphql` (uses gqlgen)
- Generate Swagger: `make swagger` (uses swaggo/swag)
## Code Style
- **Formatting**: Use `gofumpt` for all Go files (strict requirement)
- **Copyright header**: All files must include copyright header (see existing files)
- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration
- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines)
- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`)
- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package
- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`)
- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName`
- **Comments**: Document all exported functions/types with godoc-style comments
- **Structs**: Document fields with inline comments, especially for complex configurations
- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses
- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding

215
CLAUDE.md Normal file
View File

@@ -0,0 +1,215 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with
code in this repository.
## Project Overview
ClusterCockpit is a job-specific performance monitoring framework for HPC
clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a
Svelte-based frontend, and manages job archives and metric data from various
time-series databases.
## Build and Development Commands
### Building
```bash
# Build everything (frontend + backend)
make
# Build only the frontend
make frontend
# Build only the backend (requires frontend to be built first)
go build -ldflags="-s -X main.date=$(date +%Y-%m-%d:T%H:%M:%S) -X main.version=1.4.4 -X main.commit=$(git rev-parse --short HEAD)" ./cmd/cc-backend
```
### Testing
```bash
# Run all tests
make test
# Run tests with verbose output
go test -v ./...
# Run tests for a specific package
go test ./internal/repository
```
### Code Generation
```bash
# Regenerate GraphQL schema and resolvers (after modifying api/*.graphqls)
make graphql
# Regenerate Swagger/OpenAPI docs (after modifying API comments)
make swagger
```
### Frontend Development
```bash
cd web/frontend
# Install dependencies
npm install
# Build for production
npm run build
# Development mode with watch
npm run dev
```
### Running
```bash
# Initialize database and create admin user
./cc-backend -init-db -add-user demo:admin:demo
# Start server in development mode (enables GraphQL Playground and Swagger UI)
./cc-backend -server -dev -loglevel info
# Start demo with sample data
./startDemo.sh
```
## Architecture
### Backend Structure
The backend follows a layered architecture with clear separation of concerns:
- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems
- **internal/repository**: Data access layer using repository pattern
- Abstracts database operations (SQLite3 only)
- Implements LRU caching for performance
- Provides repositories for Job, User, Node, and Tag entities
- Transaction support for batch operations
- **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
- **internal/graph**: GraphQL API (uses gqlgen)
- Schema in `api/*.graphqls`
- Generated code in `internal/graph/generated/`
- Resolvers in `internal/graph/schema.resolvers.go`
- **internal/auth**: Authentication layer
- Supports local accounts, LDAP, OIDC, and JWT tokens
- Implements rate limiting for login attempts
- **internal/metricdata**: Metric data repository abstraction
- Pluggable backends: cc-metric-store, Prometheus, InfluxDB
- Each cluster can have a different metric data backend
- **internal/archiver**: Job archiving to file-based archive
- **pkg/archive**: Job archive backend implementations
- File system backend (default)
- S3 backend
- SQLite backend (experimental)
- **pkg/nats**: NATS integration for metric ingestion
### Frontend Structure
- **web/frontend**: Svelte 5 application
- Uses Rollup for building
- Components organized by feature (analysis, job, user, etc.)
- GraphQL client using @urql/svelte
- Bootstrap 5 + SvelteStrap for UI
- uPlot for time-series visualization
- **web/templates**: Server-side Go templates
### Key Concepts
**Job Archive**: Completed jobs are stored in a file-based archive following the
[ClusterCockpit job-archive
specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
Each job has a `meta.json` file with metadata and metric data files.
**Metric Data Repositories**: Time-series metric data is stored separately from
job metadata. The system supports multiple backends (cc-metric-store is
recommended). Configuration is per-cluster in `config.json`.
**Authentication Flow**:
1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT)
2. Each authenticator's `CanLogin` method is called to determine if it should handle the request
3. The first authenticator that returns true performs the actual `Login`
4. JWT tokens are used for API authentication
**Database Migrations**: SQL migrations in `internal/repository/migrations/` are
applied automatically on startup. Version tracking in `version` table.
**Scopes**: Metrics can be collected at different scopes:
- Node scope (always available)
- Core scope (for jobs with ≤8 nodes)
- Accelerator scope (for GPU/accelerator metrics)
## Configuration
- **config.json**: Main configuration (clusters, metric repositories, archive settings)
- **.env**: Environment variables (secrets like JWT keys)
- Copy from `configs/env-template.txt`
- NEVER commit this file
- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config)
## Database
- Default: SQLite 3 (`./var/job.db`)
- Connection managed by `internal/repository`
- Schema version in `internal/repository/migration.go`
## Code Generation
**GraphQL** (gqlgen):
- Schema: `api/*.graphqls`
- Config: `gqlgen.yml`
- Generated code: `internal/graph/generated/`
- Custom resolvers: `internal/graph/schema.resolvers.go`
- Run `make graphql` after schema changes
**Swagger/OpenAPI**:
- Annotations in `internal/api/*.go`
- Generated docs: `api/docs.go`, `api/swagger.yaml`
- Run `make swagger` after API changes
## Testing Conventions
- Test files use `_test.go` suffix
- Test data in `testdata/` subdirectories
- Repository tests use in-memory SQLite
- API tests use httptest
## Common Workflows
### Adding a new GraphQL field
1. Edit schema in `api/*.graphqls`
2. Run `make graphql`
3. Implement resolver in `internal/graph/schema.resolvers.go`
### Adding a new REST endpoint
1. Add handler in `internal/api/*.go`
2. Add route in `internal/api/rest.go`
3. Add Swagger annotations
4. Run `make swagger`
### Adding a new metric data backend
1. Implement `MetricDataRepository` interface in `internal/metricdata/`
2. Register in `metricdata.Init()` switch statement
3. Update config.json schema documentation
### Modifying database schema
1. Create new migration in `internal/repository/migrations/`
2. Increment `repository.Version`
3. Test with fresh database and existing database
## Dependencies
- Go 1.24.0+ (check go.mod for exact version)
- Node.js (for frontend builds)
- SQLite 3 (only supported database)
- Optional: NATS server for metric ingestion

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2022 NHR@FAU, University Erlangen-Nuremberg
Copyright (c) NHR@FAU, University Erlangen-Nuremberg
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

136
Makefile
View File

@@ -1,8 +1,6 @@
TARGET = ./cc-backend
VAR = ./var
CFG = config.json .env
FRONTEND = ./web/frontend
VERSION = 1.2.0
VERSION = 1.4.4
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
@@ -22,17 +20,27 @@ SVELTE_COMPONENTS = status \
header
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
$(wildcard $(FRONTEND)/src/*.js) \
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
$(wildcard $(FRONTEND)/src/*.js) \
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
$(wildcard $(FRONTEND)/src/config/*.svelte) \
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/*.js) \
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
$(wildcard $(FRONTEND)/src/header/*.svelte) \
$(wildcard $(FRONTEND)/src/job/*.svelte)
.PHONY: clean distclean test tags frontend $(TARGET)
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
.NOTPARALLEL:
$(TARGET): $(VAR) $(CFG) $(SVELTE_TARGETS)
$(TARGET): $(SVELTE_TARGETS)
$(info ===> BUILD cc-backend)
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
@@ -40,6 +48,15 @@ frontend:
$(info ===> BUILD frontend)
cd web/frontend && npm install && npm run build
swagger:
$(info ===> GENERATE swagger)
@go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
@mv ./api/docs.go ./internal/api/docs.go
graphql:
$(info ===> GENERATE graphql)
@go tool github.com/99designs/gqlgen
clean:
$(info ===> CLEAN)
@go clean
@@ -49,7 +66,7 @@ distclean:
@$(MAKE) clean
$(info ===> DISTCLEAN)
@rm -rf $(FRONTEND)/node_modules
@rm -rf $(VAR)
@rm -rf ./var
test:
$(info ===> TESTING)
@@ -63,103 +80,8 @@ tags:
@ctags -R
$(VAR):
@mkdir $(VAR)
config.json:
$(info ===> Initialize config.json file)
@cp configs/config.json config.json
.env:
$(info ===> Initialize .env file)
@cp configs/env-template.txt .env
@mkdir -p $(VAR)
$(SVELTE_TARGETS): $(SVELTE_SRC)
$(info ===> BUILD frontend)
cd web/frontend && npm install && npm run build
install: $(TARGET)
@WORKSPACE=$(PREFIX)
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
@mkdir --parents --verbose $${WORKSPACE}/usr/$(BINDIR)
@install -Dpm 755 $(TARGET) $${WORKSPACE}/usr/$(BINDIR)/$(TARGET)
@install -Dpm 600 configs/config.json $${WORKSPACE}/etc/$(TARGET)/$(TARGET).json
.ONESHELL:
.PHONY: RPM
RPM: build/package/cc-backend.spec
@WORKSPACE="$${PWD}"
@SPECFILE="$${WORKSPACE}/build/package/cc-backend.spec"
# Setup RPM build tree
@eval $$(rpm --eval "ARCH='%{_arch}' RPMDIR='%{_rpmdir}' SOURCEDIR='%{_sourcedir}' SPECDIR='%{_specdir}' SRPMDIR='%{_srcrpmdir}' BUILDDIR='%{_builddir}'")
@mkdir --parents --verbose "$${RPMDIR}" "$${SOURCEDIR}" "$${SPECDIR}" "$${SRPMDIR}" "$${BUILDDIR}"
# Create source tarball
@COMMITISH="HEAD"
@VERS=$$(git describe --tags $${COMMITISH})
@VERS=$${VERS#v}
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
@if [ "$${VERS}" = "" ]; then VERS="$(VERSION)"; fi
@eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}")
@PREFIX="$${NAME}-$${VERSION}"
@FORMAT="tar.gz"
@SRCFILE="$${SOURCEDIR}/$${PREFIX}.$${FORMAT}"
@git archive --verbose --format "$${FORMAT}" --prefix="$${PREFIX}/" --output="$${SRCFILE}" $${COMMITISH}
# Build RPM and SRPM
@rpmbuild -ba --define="VERS $${VERS}" --rmsource --clean "$${SPECFILE}"
# Report RPMs and SRPMs when in GitHub Workflow
@if [ "$${GITHUB_ACTIONS}" = true ]; then
@ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm"
@ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm"
@ echo "RPM: $${RPMFILE}"
@ echo "SRPM: $${SRPMFILE}"
@ echo "::set-output name=SRPM::$${SRPMFILE}"
@ echo "::set-output name=RPM::$${RPMFILE}"
@fi
.ONESHELL:
.PHONY: DEB
DEB: build/package/cc-backend.deb.control
@BASEDIR=$${PWD}
@WORKSPACE=$${PWD}/.dpkgbuild
@DEBIANDIR=$${WORKSPACE}/debian
@DEBIANBINDIR=$${WORKSPACE}/DEBIAN
@mkdir --parents --verbose $$WORKSPACE $$DEBIANBINDIR
#@mkdir --parents --verbose $$DEBIANDIR
@CONTROLFILE="$${BASEDIR}/build/package/cc-backend.deb.control"
@COMMITISH="HEAD"
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
@VERS=$${VERS#v}
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
@if [ "$${VERS}" = "" ]; then VERS="$(VERSION)"; fi
@ARCH=$$(uname -m)
@ARCH=$$(echo $$ARCH | sed -e s+'_'+'-'+g)
@if [ "$${ARCH}" = "x86-64" ]; then ARCH=amd64; fi
@PREFIX="$${NAME}-$${VERSION}_$${ARCH}"
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$${WORKSPACE}"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
@SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
#@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control
@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control
@mkdir --parents --verbose "$${WORKSPACE}"/$(VAR)
@touch "$${WORKSPACE}"/$(VAR)/job.db
@cd web/frontend && yarn install && yarn build && cd -
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
@mkdir --parents --verbose $${WORKSPACE}/usr/$(BINDIR)
@cp $(TARGET) $${WORKSPACE}/usr/$(BINDIR)/$(TARGET)
@chmod 0755 $${WORKSPACE}/usr/$(BINDIR)/$(TARGET)
@mkdir --parents --verbose $${WORKSPACE}/etc/$(TARGET)
@cp configs/config.json $${WORKSPACE}/etc/$(TARGET)/$(TARGET).json
@chmod 0600 $${WORKSPACE}/etc/$(TARGET)/$(TARGET).json
@mkdir --parents --verbose $${WORKSPACE}/usr/lib/systemd/system
@cp build/package/$(TARGET).service $${WORKSPACE}/usr/lib/systemd/system/$(TARGET).service
@chmod 0644 $${WORKSPACE}/usr/lib/systemd/system/$(TARGET).service
@mkdir --parents --verbose $${WORKSPACE}/etc/default
@cp build/package/$(TARGET).config $${WORKSPACE}/etc/default/$(TARGET)
@chmod 0600 $${WORKSPACE}/etc/default/$(TARGET)
@mkdir --parents --verbose $${WORKSPACE}/usr/lib/sysusers.d
@cp build/package/$(TARGET).sysusers $${WORKSPACE}/usr/lib/sysusers.d/$(TARGET).conf
@chmod 0644 $${WORKSPACE}/usr/lib/sysusers.d/$(TARGET).conf
@DEB_FILE="cc-metric-store_$${VERS}_$${ARCH}.deb"
@dpkg-deb -b $${WORKSPACE} "$$DEB_FILE"
@rm -r "$${WORKSPACE}"
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
@ echo "::set-output name=DEB::$${DEB_FILE}"
@fi

203
README.md
View File

@@ -1,9 +1,19 @@
# NOTE
While we do our best to keep the master branch in a usable state, there is no guarantee the master branch works.
Please do not use it for production!
Please have a look at the [Release
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
for breaking changes!
# ClusterCockpit REST and GraphQL API backend
[![Build](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml)
This is a Golang backend implementation for a REST and GraphQL API according to
the [ClusterCockpit specifications](https://github.com/ClusterCockpit/cc-specifications). It also
the [ClusterCockpit
specifications](https://github.com/ClusterCockpit/cc-specifications). It also
includes a web interface for ClusterCockpit. This implementation replaces the
previous PHP Symfony based ClusterCockpit web interface. The reasons for
switching from PHP Symfony to a Golang based solution are explained
@@ -11,31 +21,30 @@ switching from PHP Symfony to a Golang based solution are explained
## Overview
This is a Golang web backend for the ClusterCockpit job-specific performance
monitoring framework. It provides a REST API for integrating ClusterCockpit with
an HPC cluster batch system and external analysis scripts. Data exchange between
the web front-end and the back-end is based on a GraphQL API. The web frontend
is also served by the backend using [Svelte](https://svelte.dev/) components.
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
[Bootstrap Icons](https://icons.getbootstrap.com/).
This is a Golang web backend for the ClusterCockpit job-specific performance monitoring framework.
It provides a REST API for integrating ClusterCockpit with an HPC cluster batch system and external analysis scripts.
Data exchange between the web front-end and the back-end is based on a GraphQL API.
The web frontend is also served by the backend using [Svelte](https://svelte.dev/) components.
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using [Bootstrap Icons](https://icons.getbootstrap.com/).
The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by default.
Optionally it can use a MySQL/MariaDB database server.
While there are metric data backends for the InfluxDB and Prometheus time series databases, the only tested and supported setup is to use cc-metric-store as the metric data backend.
Documentation on how to integrate ClusterCockpit with other time series databases will be added in the future.
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
While there are metric data backends for the InfluxDB and Prometheus time series
databases, the only tested and supported setup is to use cc-metric-store as the
metric data backend. Documentation on how to integrate ClusterCockpit with other
time series databases will be added in the future.
Completed batch jobs are stored in a file-based job archive according to
[this specification] (https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
The backend supports authentication via local accounts, an external LDAP
directory, and JWT tokens. Authorization for APIs is implemented with
[JWT](https://jwt.io/) tokens created with public/private key encryption.
You find more detailed information here:
* `./configs/README.md`: Infos about configuration and setup of cc-backend.
* `./init/README.md`: Infos on how to setup cc-backend as systemd service on Linux.
* `./tools/README.md`: Infos on the JWT authorizatin token workflows in ClusterCockpit.
* `./docs`: You can find further documentation here. There is also a Hands-on tutorial that is recommended to get familiar with the ClusterCockpit setup.
You can find detailed documentation on the [ClusterCockpit
Webpage](https://clustercockpit.org).
**NOTE**
## Build requirements
ClusterCockpit requires a current version of the golang toolchain and node.js.
You can check `go.mod` to see what is the current minimal golang version needed.
@@ -46,7 +55,7 @@ on the Go standard library, it is crucial for security and performance to use a
current version of golang. In addition, an old golang toolchain may limit the supported
versions of third-party packages.
## How to try ClusterCockpit with a demo setup.
## How to try ClusterCockpit with a demo setup
We provide a shell script that downloads demo data and automatically starts the
cc-backend. You will need `wget`, `go`, `node`, `npm` in your path to
@@ -58,31 +67,37 @@ cd ./cc-backend
./startDemo.sh
```
You can also try the demo using the lates release binary.
You can also try the demo using the latest release binary.
Create a folder and put the release binary `cc-backend` into this folder.
Execute the following steps:
```
$ ./cc-backend -init
$ vim config.json (Add a second cluster entry and name the clusters alex and fritz)
$ wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
$ tar xf job-archive-demo.tar
$ ./cc-backend -init-db -add-user demo:admin:demo -loglevel info
$ ./cc-backend -server -dev -loglevel info
```shell
./cc-backend -init
vim config.json (Add a second cluster entry and name the clusters alex and fritz)
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
tar xf job-archive-demo.tar
./cc-backend -init-db -add-user demo:admin:demo -loglevel info
./cc-backend -server -dev -loglevel info
```
You can access the web interface at http://localhost:8080.
You can access the web interface at [http://localhost:8080](http://localhost:8080).
Credentials for login are `demo:demo`.
Please note that some views do not work without a metric backend (e.g., the
Analysis, Systems and Status views).
## Howto build and run
## How to build and run
There is a Makefile to automate the build of cc-backend. The Makefile supports the following targets:
* `$ make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequesite handling. Any change of frontend source files will result in a complete rebuild.
* `$ make clean`: Clean go build cache and remove binary.
* `$ make test`: Run the tests that are also run in the GitHub workflow setup.
There is a Makefile to automate the build of cc-backend. The Makefile supports
the following targets:
- `make`: Initialize `var` directory and build svelte frontend and backend
binary. Note that there is no proper prerequisite handling. Any change of
frontend source files will result in a complete rebuild.
- `make clean`: Clean go build cache and remove binary.
- `make test`: Run the tests that are also run in the GitHub workflow setup.
A common workflow for setting up cc-backend from scratch is:
```sh
git clone https://github.com/ClusterCockpit/cc-backend.git
@@ -113,89 +128,43 @@ ln -s <your-existing-job-archive> ./var/job-archive
./cc-backend -help
```
### Run as systemd daemon
To run this program as a daemon, cc-backend comes with an [example systemd setup](./init/README.md).
## Configuration and setup
cc-backend can be used as a local web interface for an existing job archive or
as a server for the ClusterCockpit monitoring framework.
Create your job archive according to [this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
At least one cluster directory with a valid `cluster.json` file is required. If
you configure the job archive from scratch, you must also create the job
archive version file that contains the job archive version as an integer.
You can retrieve the currently supported version by running the following
command:
```
$ ./cc-backend -version
```
It is ok to have no jobs in the job archive.
### Configuration
A configuration file in JSON format must be specified with `-config` to override the default settings.
By default, a `config.json` file located in the current directory of the `cc-backend` process will be loaded even without the `-config` flag.
Documentation of all supported configuration and command line options can be found [here](./configs/README.md).
## Database initialization and migration
Each `cc-backend` version supports a specific database version.
At startup, the version of the sqlite database is checked and `cc-backend` terminates if the version does not match.
`cc-backend` supports the migration of the database schema to the required version with the command line option `-migrate-db`.
If the database file does not exist yet, it will be created and initialized with the command line option `-migrate-db`.
If you want to use a newer database version with an older version of cc-backend, you can downgrade a database with the external tool [migrate](https://github.com/golang-migrate/migrate).
In this case, you must specify the path to the migration files in a current source tree: `./internal/repository/migrations/`.
## Development and testing
When making changes to the REST or GraphQL API, the appropriate code generators must be used.
You must always rebuild `cc-backend` after updating the API files.
### Update GraphQL schema
This project uses [gqlgen](https://github.com/99designs/gqlgen) for the GraphQL API.
The schema can be found in `./api/schema.graphqls`.
After changing it, you need to run `go run github.com/99designs/gqlgen`, which will update `./internal/graph/model`.
If new resolvers are needed, they will be added to `./internal/graph/schema.resolvers.go`, where you will then need to implement them.
If you start `cc-backend` with the `-dev` flag, the GraphQL Playground UI is available at http://localhost:8080/playground.
### Update Swagger UI
This project integrates [swagger ui](https://swagger.io/tools/swagger-ui/) to document and test its REST API.
The swagger documentation files can be found in `./api/`.
You can generate the swagger-ui configuration by running `go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./pkg/schema -g rest.go -o ./api `.
You need to move the created `./api/docs.go` to `./internal/api/docs.go`.
If you start cc-backend with the `-dev` flag, the Swagger interface is available
at http://localhost:8080/swagger/.
You must enter a JWT key for a user with the API role.
**NOTE**
The user who owns the JWT key must not be logged into the same browser (have a
running session), or the Swagger requests will not work. It is recommended to
create a separate user that has only the API role.
## Development and testing
If the REST or GraphQL API is changed, the corresponding code generators must be used.
## Project file structure
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api) contains the API schema files for the REST and GraphQL APIs. The REST API is documented in the OpenAPI 3.0 format in [./api/openapi.yaml](./api/openapi.yaml).
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend) contains `main.go` for the main application.
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs) contains documentation about configuration and command line options and required environment variables. A sample configuration file is provided.
- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs) contains more in-depth documentation.
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init) contains an example of setting up systemd for production use.
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal) contains library source code that is not intended for use by others.
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg) contains Go packages that can be used by other projects.
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools) Additional command line helper tools.
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager) Commands for getting infos about an existing job archive.
- [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration) Tool to migrate from previous to current job archive version.
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey) Tool to convert external pubkey for use in `cc-backend`.
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair) contains a small application to generate a compatible JWT keypair. You find documentation on how to use it [here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
- [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web) Server-side templates and frontend-related files:
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend) Svelte components and static assets for the frontend UI
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates) Server-side Go templates
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml) Configures the behaviour and generation of [gqlgen](https://github.com/99designs/gqlgen).
- [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh) is a shell script that sets up demo data, and builds and starts `cc-backend`.
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
contains the API schema files for the REST and GraphQL APIs. The REST API is
documented in the OpenAPI 3.0 format in
[./api/openapi.yaml](./api/openapi.yaml).
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
contains `main.go` for the main application.
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
contains documentation about configuration and command line options and required
environment variables. A sample configuration file is provided.
- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
contains more in-depth documentation.
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
contains an example of setting up systemd for production use.
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
contains library source code that is not intended for use by others.
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
contains Go packages that can be used by other projects.
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
Additional command line helper tools.
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
Commands for getting infos about an existing job archive.
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
Tool to convert external pubkey for use in `cc-backend`.
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
contains a small application to generate a compatible JWT keypair. You find
documentation on how to use it
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
- [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
Server-side templates and frontend-related files:
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
Svelte components and static assets for the frontend UI
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
Server-side Go templates
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
Configures the behaviour and generation of
[gqlgen](https://github.com/99designs/gqlgen).
- [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
is a shell script that sets up demo data, and builds and starts `cc-backend`.

View File

@@ -1,36 +1,47 @@
# `cc-backend` version 1.2.0
# `cc-backend` version 1.4.4
Supports job archive version 1 and database version 6.
Supports job archive version 2 and database version 8.
This is a minor release of `cc-backend`, the API backend and frontend
This is a bug fix release of `cc-backend`, the API backend and frontend
implementation of ClusterCockpit.
For release-specific notes visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
** Breaking changes **
## Breaking changes
* The LDAP configuration option user_filter was changed and now should not include
the uid wildcard. Example:
- Old: `"user_filter": "(&(objectclass=posixAccount)(uid=*))"`
- New: `"user_filter": "(&(objectclass=posixAccount))"`
The option `apiAllowedIPs` is now a required configuration attribute in
`config.json`. This option restricts access to the admin API.
* The aggregate job statistic core hours is now computed using the job table
column `num_hwthreads`. In a future release this column will be renamed to
`num_cores`. For correct display of core hours `num_hwthreads` must be correctly
filled on job start. If your existing jobs do not provide the correct value in
this column then you can set this with one SQL INSERT statement. This only applies
if you have exclusive jobs, only. Please be aware that we treat this column as
it is the number of cores. In case you have SMT enabled and `num_hwthreads`
is not the number of cores the core hours will be too high by a factor!
To retain the previous behavior that the API is per default accessible from
everywhere set:
* The jwts key is now mandatory in config.json. It has to set max-age for
validity. Some key names have changed, please refer to
[config documentation](./configs/README.md) for details.
** NOTE **
If you are using the sqlite3 backend the `PRAGMA` option `foreign_keys` must be
explicitly set to ON. If using the sqlite3 console it is per default set to
OFF! On every console session you must set:
```json
"apiAllowedIPs": [
"*"
]
```
sqlite> PRAGMA foreign_keys = ON;
```
Otherwise if you delete jobs the jobtag relation table will not be updated accordingly!
## Breaking changes for minor release 1.4.x
- You need to perform a database migration. Depending on your database size the
migration might require several hours!
- You need to adapt the `cluster.json` configuration files in the job-archive,
add new required attributes to the metric list and after that edit
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
attribute set can be filtered and show up in the footprint UI and polar plot.
- Continuous scrolling is now the default in all job lists. You can change this back
to paging globally, and every user can also choose between paging and continuous
scrolling individually.
- Tags have a scope now. Existing tags will get global scope in the database
migration.
## New features
- Tags can now be deleted from the web interface
## Known issues
- Currently energy footprint metrics of type energy are ignored for calculating
total energy.
- Resampling for running jobs only works with cc-metric-store
- With energy footprint metrics of type power the unit is ignored and it is
assumed the metric has the unit Watt.

View File

@@ -4,133 +4,222 @@ scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
scalar SchedulerState
scalar MonitoringState
type Node {
id: ID!
hostname: String!
cluster: String!
subCluster: String!
jobsRunning: Int!
cpusAllocated: Int
memoryAllocated: Int
gpusAllocated: Int
schedulerState: SchedulerState!
healthState: MonitoringState!
metaData: Any
}
type NodeStates {
state: String!
count: Int!
}
type NodeStatesTimed {
state: String!
counts: [Int!]!
times: [Int!]!
}
type Job {
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
energy: Float!
SMT: Int!
shared: String!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
metaData: Any
userData: User
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
footprint: [FootprintValue]
energyFootprint: [EnergyFootprintValue]
metaData: Any
userData: User
}
type JobLink {
id: ID!
jobId: Int!
id: ID!
jobId: Int!
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
name: String!
partitions: [String!]! # Slurm partitions
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
topology: Topology!
metricConfig: [MetricConfig!]!
footprint: [String!]!
}
type FootprintValue {
name: String!
stat: String!
value: Float!
}
type EnergyFootprintValue {
hardware: String!
metric: String!
value: Float!
}
type MetricValue {
name: String
unit: Unit!
value: Float!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
id: String!
type: String!
model: String!
}
type SubClusterConfig {
name: String!
peak: Float
normal: Float
name: String!
peak: Float
normal: Float
caution: Float
alert: Float
remove: Boolean
alert: Float
remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
alert: Float!
lowerIsBetter: Boolean
subClusters: [SubClusterConfig!]!
}
type Tag {
id: ID!
id: ID!
type: String!
name: String!
scope: String!
}
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
scope: MetricScope!
name: String!
scope: MetricScope!
metric: JobMetric!
}
type ClusterMetricWithName {
name: String!
unit: Unit
timestep: Int!
data: [NullableFloat!]!
}
type JobMetric {
unit: Unit
timestep: Int!
series: [Series!]
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: String
hostname: String!
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
data: [NullableFloat!]!
}
type StatsSeries {
mean: [NullableFloat!]!
median: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type NamedStatsWithScope {
name: String!
scope: MetricScope!
stats: [ScopedStats!]!
}
type ScopedStats {
hostname: String!
id: String
data: MetricStatistics!
}
type JobStats {
id: Int!
jobId: String!
startTime: Int!
duration: Int!
cluster: String!
subCluster: String!
numNodes: Int!
numHWThreads: Int
numAccelerators: Int
stats: [NamedStats!]!
}
type NamedStats {
name: String!
data: MetricStatistics!
}
type Unit {
@@ -144,20 +233,14 @@ type MetricStatistics {
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
data: [NullableFloat!]!
}
type Footprints {
timeWeights: TimeWeights!
metrics: [MetricFootprints!]!
metrics: [MetricFootprints!]!
}
type TimeWeights {
@@ -166,87 +249,221 @@ type TimeWeights {
coreHours: [NullableFloat!]!
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS }
enum Aggregate {
USER
PROJECT
CLUSTER
SUBCLUSTER
}
enum SortByAggregate {
TOTALWALLTIME
TOTALJOBS
TOTALUSERS
TOTALNODES
TOTALNODEHOURS
TOTALCORES
TOTALCOREHOURS
TOTALACCS
TOTALACCHOURS
}
type NodeMetrics {
host: String!
host: String!
state: String!
subCluster: String!
metrics: [JobMetricWithName!]!
metrics: [JobMetricWithName!]!
}
type ClusterMetrics {
nodeCount: Int!
metrics: [ClusterMetricWithName!]!
}
type NodesResultList {
items: [NodeMetrics!]!
offset: Int
limit: Int
count: Int
totalNodes: Int
hasNextPage: Boolean
}
type ClusterSupport {
cluster: String!
subClusters: [String!]!
}
type GlobalMetricListItem {
name: String!
unit: Unit!
scope: MetricScope!
footprint: String
availability: [ClusterSupport!]!
}
type Count {
name: String!
name: String!
count: Int!
}
type User {
username: String!
name: String!
email: String!
name: String!
email: String!
}
input MetricStatItem {
metricName: String!
range: FloatRange!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
globalMetrics: [GlobalMetricListItem!]!
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
## Node Queries New
node(id: ID!): Node
nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
nodeStates(filter: [NodeFilter!]): [NodeStates!]!
nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobMetrics(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
resolution: Int
): [JobMetricWithName!]!
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
scopedJobStats(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
): [NamedStatsWithScope!]!
jobs(
filter: [JobFilter!]
page: PageRequest
order: OrderByInput
): JobResultList!
jobsStatistics(
filter: [JobFilter!]
metrics: [String!]
page: PageRequest
sortBy: SortByAggregate
groupBy: Aggregate
numDurationBins: String
numMetricBins: Int
): [JobsStatistics!]!
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
rooflineHeatmap(
filter: [JobFilter!]!
rows: Int!
cols: Int!
minX: Float!
minY: Float!
maxX: Float!
maxY: Float!
): [[Float!]!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(
cluster: String!
nodes: [String!]
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
): [NodeMetrics!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
nodeMetricsList(
cluster: String!
subCluster: String!
stateFilter: String!
nodeFilter: String!
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
page: PageRequest
resolution: Int
): NodesResultList!
clusterMetrics(
cluster: String!
metrics: [String!]
from: Time!
to: Time!
): ClusterMetrics!
}
type Mutation {
createTag(type: String!, name: String!): Tag!
createTag(type: String!, name: String!, scope: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagFromList(tagIds: [ID!]!): [Int!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type IntRangeOutput {
from: Int!
to: Int!
}
type TimeRangeOutput {
range: String
from: Time!
to: Time!
}
input NodeFilter {
hostname: StringInput
cluster: StringInput
subcluster: StringInput
schedulerState: SchedulerState
healthState: MonitoringState
timeStart: Int
}
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
tags: [ID!]
dbId: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
energy: FloatRange
minRunningFor: Int
numNodes: IntRange
numNodes: IntRange
numAccelerators: IntRange
numHWThreads: IntRange
numHWThreads: IntRange
startTime: TimeRange
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
exclusive: Int
node: StringInput
startTime: TimeRange
state: [JobState!]
metricStats: [MetricStatItem!]
shared: String
node: StringInput
}
input OrderByInput {
field: String!
type: String!
order: SortDirectionEnum! = ASC
}
@@ -256,29 +473,46 @@ enum SortDirectionEnum {
}
input StringInput {
eq: String
neq: String
contains: String
eq: String
neq: String
contains: String
startsWith: String
endsWith: String
in: [String!]
endsWith: String
in: [String!]
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
input IntRange {
from: Int!
to: Int!
}
input TimeRange {
range: String
from: Time
to: Time
}
input FloatRange {
from: Float!
to: Float!
}
type NodeStateResultList {
items: [Node!]!
count: Int
}
type JobResultList {
items: [Job!]!
items: [Job!]!
offset: Int
limit: Int
count: Int
limit: Int
count: Int
hasNextPage: Boolean
}
type JobLinkResultList {
listQuery: String
items: [JobLink!]!
count: Int
items: [JobLink!]!
count: Int
}
type HistoPoint {
@@ -286,26 +520,42 @@ type HistoPoint {
value: Int!
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodes: Int! # Sum of the nodes of all matched jobs
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCores: Int! # Sum of the cores of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccs: Int! # Sum of the accs of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
type MetricHistoPoints {
metric: String!
unit: String!
stat: String
data: [MetricHistoPoint!]
}
type MetricHistoPoint {
bin: Int
count: Int!
min: Int
max: Int
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs)
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than config'd ShortRunningJobsDuration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodes: Int! # Sum of the nodes of all matched jobs
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCores: Int! # Sum of the cores of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccs: Int! # Sum of the accs of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
}
input PageRequest {
itemsPerPage: Int!
page: Int!
page: Int!
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +0,0 @@
CC_USER=clustercockpit
CC_GROUP=clustercockpit
CC_HOME=/tmp
LOG_DIR=/var/log
DATA_DIR=/var/run/cc-backend
MAX_OPEN_FILES=10000
CONF_DIR=/etc/cc-backend
CONF_FILE=/etc/cc-backend/cc-backend.json
RESTART_ON_UPGRADE=true

View File

@@ -1,12 +0,0 @@
Package: cc-backend
Version: {VERSION}
Installed-Size: {INSTALLED_SIZE}
Architecture: {ARCH}
Maintainer: thomas.gruber@fau.de
Depends: libc6 (>= 2.2.1)
Build-Depends: debhelper-compat (= 13), git, golang-go, npm, yarn
Description: ClusterCockpit backend and web frontend
Homepage: https://github.com/ClusterCockpit/cc-backend
Source: cc-backend
Rules-Requires-Root: no

View File

@@ -1,18 +0,0 @@
[Unit]
Description=ClusterCockpit backend and web frontend (cc-backend)
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
[Service]
EnvironmentFile=/etc/default/cc-backend
Type=simple
User=clustercockpit
Group=clustercockpit
Restart=on-failure
TimeoutStopSec=100
LimitNOFILE=infinity
ExecStart=/usr/bin/cc-backend --config ${CONF_FILE}
[Install]
WantedBy=multi-user.target

View File

@@ -1,70 +0,0 @@
Name: cc-backend
Version: %{VERS}
Release: 1%{?dist}
Summary: ClusterCockpit backend and web frontend
License: MIT
Source0: %{name}-%{version}.tar.gz
#BuildRequires: go-toolset
#BuildRequires: systemd-rpm-macros
#BuildRequires: npm
Provides: %{name} = %{version}
%description
ClusterCockpit backend and web frontend
%global debug_package %{nil}
%prep
%autosetup
%build
#CURRENT_TIME=$(date +%Y-%m-%d:T%H:%M:\%S)
#LD_FLAGS="-s -X main.buildTime=${CURRENT_TIME} -X main.version=%{VERS}"
mkdir ./var
touch ./var/job.db
cd web/frontend && yarn install && yarn build && cd -
go build -ldflags="-s -X main.version=%{VERS}" ./cmd/cc-backend
%install
# Install cc-backend
#make PREFIX=%{buildroot} install
install -Dpm 755 cc-backend %{buildroot}/%{_bindir}/%{name}
install -Dpm 0600 configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
# Integrate into system
install -Dpm 0644 build/package/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
install -Dpm 0600 build/package/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
install -Dpm 0644 build/package/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
%check
# go test should be here... :)
%pre
%sysusers_create_package scripts/%{name}.sysusers
%post
%systemd_post %{name}.service
%preun
%systemd_preun %{name}.service
%files
# Binary
%attr(-,clustercockpit,clustercockpit) %{_bindir}/%{name}
# Config
%dir %{_sysconfdir}/%{name}
%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json
# Systemd
%{_unitdir}/%{name}.service
%{_sysconfdir}/default/%{name}
%{_sysusersdir}/%{name}.conf
%changelog
* Mon Mar 07 2022 Thomas Gruber - 0.1
- Initial metric store implementation

View File

@@ -1,2 +0,0 @@
#Type Name ID GECOS Home directory Shell
u clustercockpit - "User for ClusterCockpit" /run/cc-backend /sbin/nologin

38
cmd/cc-backend/cli.go Normal file
View File

@@ -0,0 +1,38 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file defines all command-line flags and their default values.
package main
import "flag"
// Values of the command-line flags. They hold their zero value until cliInit
// has registered and parsed the flags.
var (
	// Boolean switches.
	flagInit, flagReinitDB, flagSyncLDAP, flagServer, flagGops, flagDev bool
	flagVersion, flagMigrateDB, flagRevertDB, flagApplyTags             bool
	flagForceDB, flagLogDateTime                                        bool

	// Options that take a string argument.
	flagConfigFile, flagNewUser, flagDelUser string
	flagGenJWT, flagImportJob, flagLogLevel  string
)

// cliInit registers every command-line flag of cc-backend on the default
// flag set and then parses the process arguments. All boolean flags default
// to false; string flags carry the defaults listed in the table below.
func cliInit() {
	switches := []struct {
		target *bool
		name   string
		usage  string
	}{
		{&flagInit, "init", "Setup var directory, initialize sqlite database file, config.json and .env"},
		{&flagReinitDB, "init-db", "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)"},
		{&flagSyncLDAP, "sync-ldap", "Sync the 'hpc_user' table with ldap"},
		{&flagServer, "server", "Start a server, continues listening on port after initialization and argument handling"},
		{&flagGops, "gops", "Listen via github.com/google/gops/agent (for debugging)"},
		{&flagDev, "dev", "Enable development components: GraphQL Playground and Swagger UI"},
		{&flagVersion, "version", "Show version information and exit"},
		{&flagMigrateDB, "migrate-db", "Migrate database to supported version and exit"},
		{&flagRevertDB, "revert-db", "Migrate database to previous version and exit"},
		{&flagApplyTags, "apply-tags", "Run taggers on all completed jobs and exit"},
		{&flagForceDB, "force-db", "Force database version, clear dirty flag and exit"},
		{&flagLogDateTime, "logdate", "Set this flag to add date and time to log messages"},
	}
	for _, sw := range switches {
		flag.BoolVar(sw.target, sw.name, false, sw.usage)
	}

	options := []struct {
		target   *string
		name     string
		fallback string
		usage    string
	}{
		{&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`"},
		{&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>"},
		{&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>"},
		{&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`"},
		{&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`"},
		{&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`"},
	}
	for _, opt := range options {
		flag.StringVar(opt.target, opt.name, opt.fallback, opt.usage)
	}

	flag.Parse()
}

119
cmd/cc-backend/init.go Normal file
View File

@@ -0,0 +1,119 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file contains bootstrap logic for initializing the environment,
// creating default configuration files, and setting up the database.
package main
import (
"encoding/json"
"os"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/util"
)
// envString is the default content that initEnv writes to ./.env. The key
// material it contains is example data only and, as stated in the text
// itself, must not be used in production.
const envString = `
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
# You can generate your own keypair using the gen-keypair tool
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
`
// configString is the default JSON configuration that initEnv writes to
// ./config.json. It contains the sections "main", "cron", "archive", "auth"
// and a single placeholder cluster named "name".
const configString = `
{
"main": {
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
},
"apiAllowedIPs": [
"*"
],
"emission-constant": 317
},
"cron": {
"commit-job-worker": "2m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2000h"
}
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
}
}
}
]
}
`
// initEnv bootstraps a fresh working directory for cc-backend.
//
// It refuses to run (and exits) if ./var already exists. Otherwise it writes
// the default ./config.json and ./.env, creates ./var, migrates a new SQLite
// database at ./var/job.db, creates ./var/job-archive and initializes the
// file-based job archive in it. Any failure aborts the process via cclog.
func initEnv() {
	if util.CheckFileExists("var") {
		cclog.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
	}

	if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
		cclog.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
	}

	// .env carries the JWT private key and the session secret, so it is
	// created readable and writable by the owner only. The mode is applied
	// only when the file is newly created; an existing file keeps its mode.
	if err := os.WriteFile(".env", []byte(envString), 0o600); err != nil {
		cclog.Abortf("Could not write default ./.env file with permissions '0o600'. Application initialization failed, exited.\nError: %s\n", err.Error())
	}

	if err := os.Mkdir("var", 0o777); err != nil {
		cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
	}

	if err := repository.MigrateDB("./var/job.db"); err != nil {
		cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
	}

	if err := os.Mkdir("var/job-archive", 0o777); err != nil {
		cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
	}

	// Same archive settings as the "archive" section of configString.
	archiveCfg := `{"kind": "file","path": "./var/job-archive"}`
	if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
		cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error())
	}
}

File diff suppressed because it is too large Load Diff

390
cmd/cc-backend/server.go Normal file
View File

@@ -0,0 +1,390 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file contains HTTP server setup, routing configuration, and
// authentication middleware integration.
package main
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"os"
"strings"
"time"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/handler/transport"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
"github.com/ClusterCockpit/cc-backend/web"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
httpSwagger "github.com/swaggo/http-swagger"
)
// buildInfo holds the version, commit hash and build time. It is set by
// NewServer and passed to the page templates rendered in this file.
var buildInfo web.Build
// Environment variable names
const (
// envDebug names the variable checked in Server.init: unless it is set to
// "1", a GraphQL recover function is installed that turns panics into
// GraphQL errors.
envDebug = "DEBUG"
)
// Server encapsulates the HTTP server state and dependencies.
type Server struct {
// router receives all routes and middleware registered in init.
router *mux.Router
// server is the underlying http.Server; it is created in Start and is nil
// before that.
server *http.Server
// restAPIHandle is created in init and mounts the REST API routes.
restAPIHandle *api.RestAPI
// natsAPIHandle is only created in init when config.Keys.APISubjects is set;
// otherwise it stays nil.
natsAPIHandle *api.NatsAPI
}
// onFailureResponse is the failure callback shared by the API authentication
// middlewares. It answers with HTTP 401 and a JSON object that carries the
// status text ("status") and the message of err ("error").
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
	const code = http.StatusUnauthorized

	rw.Header().Add("Content-Type", "application/json")
	rw.WriteHeader(code)

	payload := map[string]string{
		"status": http.StatusText(code),
		"error":  err.Error(),
	}
	json.NewEncoder(rw).Encode(payload)
}
// NewServer records the given build information in the package-level
// buildInfo, creates a Server with a fresh router and runs its route setup.
// It returns the error of that setup, if any, and no Server in that case.
func NewServer(version, commit, buildDate string) (*Server, error) {
	buildInfo = web.Build{
		Version:   version,
		Hash:      commit,
		Buildtime: buildDate,
	}

	srv := &Server{router: mux.NewRouter()}
	err := srv.init()
	if err != nil {
		return nil, err
	}

	return srv, nil
}
// init builds the complete routing table on s.router: the GraphQL endpoint,
// the login/logout and static info pages, one subrouter per API family with
// its authentication middleware, the static file handler and the global
// middleware (compression, panic recovery, CORS).
//
// It returns an error only when the NATS subscriptions cannot be started.
func (s *Server) init() error {
// Setup the http.Handler/Router used by the server
graph.Init()
resolver := graph.GetResolverInstance()
graphQLServer := handler.New(
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
graphQLServer.AddTransport(transport.POST{})
if os.Getenv(envDebug) != "1" {
// Having this handler means that an error message is returned via GraphQL instead of the connection simply being closed.
// The problem with this is that then, no more stacktrace is printed to stderr.
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
switch e := err.(type) {
case string:
return fmt.Errorf("MAIN > Panic: %s", e)
case error:
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
}
return errors.New("MAIN > Internal server error (panic)")
})
}
authHandle := auth.GetAuthInstance()
s.restAPIHandle = api.New()
// info is handed to every rendering of the login template below; it tells
// the template whether OpenID Connect endpoints were registered.
info := map[string]any{}
info["hasOpenIDConnect"] = false
if auth.Keys.OpenIDConfig != nil {
openIDConnect := auth.NewOIDC(authHandle)
openIDConnect.RegisterEndpoints(s.router)
info["hasOpenIDConnect"] = true
}
// Pages registered directly on s.router, i.e. without any of the
// authentication middlewares attached to the subrouters below.
s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
cclog.Debugf("##%v##", info)
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
}).Methods(http.MethodGet)
s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
})
s.router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
})
// One subrouter per API family so that each can get its own
// authentication middleware.
secured := s.router.PathPrefix("/").Subrouter()
securedapi := s.router.PathPrefix("/api").Subrouter()
userapi := s.router.PathPrefix("/userapi").Subrouter()
configapi := s.router.PathPrefix("/config").Subrouter()
frontendapi := s.router.PathPrefix("/frontend").Subrouter()
metricstoreapi := s.router.PathPrefix("/metricstore").Subrouter()
// Login/logout handlers and all authentication middlewares are only
// installed when authentication is not disabled in the configuration.
if !config.Keys.DisableAuthentication {
// Create login failure handler (used by both /login and /jwt-login)
loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
}
s.router.Handle("/login", authHandle.Login(loginFailureHandler)).Methods(http.MethodPost)
s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler))
s.router.Handle("/logout", authHandle.Logout(
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusOK)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Bye - ClusterCockpit",
MsgType: "alert-info",
Message: "Logout successful",
Build: buildInfo,
Infos: info,
})
}))).Methods(http.MethodPost)
// Web UI: on failure the login page is rendered again and the requested
// URI is passed along as Redirect.
// NOTE(review): unlike the other HTML responses in this function, this
// failure handler does not set a Content-Type header - confirm intended.
secured.Use(func(next http.Handler) http.Handler {
return authHandle.Auth(
// On success;
next,
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Authentication failed - ClusterCockpit",
MsgType: "alert-danger",
Message: err.Error(),
Build: buildInfo,
Infos: info,
Redirect: r.RequestURI,
})
})
})
// API families: every failure is answered by onFailureResponse.
securedapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthAPI(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
userapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthUserAPI(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
metricstoreapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthMetricStoreAPI(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
configapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthConfigAPI(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
frontendapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthFrontendAPI(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
}
// Development-only endpoints, enabled by the -dev command-line flag.
// NOTE(review): the Swagger document URL is always built with "http://",
// also when HTTPS is configured - confirm intended.
if flagDev {
s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
}
secured.Handle("/query", graphQLServer)
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
routerConfig.HandleSearchBar(rw, r, buildInfo)
})
// Mount all /monitoring/... and /api/... routes.
routerConfig.SetupRoutes(secured, buildInfo)
s.restAPIHandle.MountAPIRoutes(securedapi)
s.restAPIHandle.MountUserAPIRoutes(userapi)
s.restAPIHandle.MountConfigAPIRoutes(configapi)
s.restAPIHandle.MountFrontendAPIRoutes(frontendapi)
// The NATS API is only created and subscribed when API subjects are
// configured; this is the only step of init that can fail.
if config.Keys.APISubjects != nil {
s.natsAPIHandle = api.NewNatsAPI()
if err := s.natsAPIHandle.StartSubscriptions(); err != nil {
return fmt.Errorf("starting NATS subscriptions: %w", err)
}
}
s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi)
// Static assets: either the files embedded in the binary (with an optional
// local ./var/img directory served under /img/) or a directory on disk.
// These prefix routes are registered after all other routes.
if config.Keys.EmbedStaticFiles {
if i, err := os.Stat("./var/img"); err == nil {
if i.IsDir() {
cclog.Info("Use local directory for static images")
s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
}
}
s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles()))
} else {
s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
}
// Global middleware: response compression, panic recovery with stack
// trace, and CORS.
// NOTE(review): credentials are allowed together with the wildcard origin
// "*" - confirm this CORS policy is intended.
s.router.Use(handlers.CompressHandler)
s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
s.router.Use(handlers.CORS(
handlers.AllowCredentials(),
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"})))
return nil
}
// Server timeout defaults (in seconds). Server.Start converts them to
// time.Duration and uses them as ReadTimeout and WriteTimeout of the
// http.Server.
const (
defaultReadTimeout = 20
defaultWriteTimeout = 20
)
// Start serves HTTP(S) requests on config.Keys.Addr until ctx is cancelled or
// the server fails.
//
// All listening sockets are bound before runtimeEnv.DropPrivileges is called,
// so that a privileged port can be used while requests are handled with the
// configured user and group. If both an HTTPS certificate file and key file
// are configured, the listener is wrapped in TLS. If RedirectHTTPTo is set
// and Addr does not end in ":80", an additional best-effort listener on port
// 80 redirects every request to that URL; a failure of that redirect server
// is logged but does not stop the main server.
//
// Start blocks. It returns nil after a shutdown and a wrapped error if
// binding the address, loading the key pair, dropping privileges or serving
// fails. The main listener is closed again on every error path.
func (s *Server) Start(ctx context.Context) error {
	// Requests are logged through cclog; the output writer of the logging
	// handler itself is discarded.
	handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) {
		cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
			params.Request.Method, params.URL.RequestURI(),
			params.StatusCode, float32(params.Size)/1024,
			time.Since(params.TimeStamp).Milliseconds())
	})

	// Fixed default timeouts, see defaultReadTimeout/defaultWriteTimeout.
	s.server = &http.Server{
		ReadTimeout:  time.Duration(defaultReadTimeout) * time.Second,
		WriteTimeout: time.Duration(defaultWriteTimeout) * time.Second,
		Handler:      handler,
		Addr:         config.Keys.Addr,
	}

	// Start http or https server
	listener, err := net.Listen("tcp", config.Keys.Addr)
	if err != nil {
		return fmt.Errorf("starting listener on '%s': %w", config.Keys.Addr, err)
	}

	if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
		// Bind port 80 synchronously: binding inside the goroutine would race
		// with DropPrivileges below and could fail once privileges are gone.
		redirectListener, err := net.Listen("tcp", ":80")
		if err != nil {
			cclog.Errorf("HTTP redirect listener on ':80' could not be started: %v", err)
		} else {
			redirect := http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently)
			go func() {
				if err := http.Serve(redirectListener, redirect); err != nil && !errors.Is(err, http.ErrServerClosed) {
					cclog.Errorf("HTTP redirect server failed: %v", err)
				}
			}()
		}
	}

	if config.Keys.HTTPSCertFile != "" && config.Keys.HTTPSKeyFile != "" {
		cert, err := tls.LoadX509KeyPair(
			config.Keys.HTTPSCertFile, config.Keys.HTTPSKeyFile)
		if err != nil {
			listener.Close()
			return fmt.Errorf("loading X509 keypair (check 'https-cert-file' and 'https-key-file' in config.json): %w", err)
		}
		listener = tls.NewListener(listener, &tls.Config{
			Certificates: []tls.Certificate{cert},
			CipherSuites: []uint16{
				tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
				tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
			},
			MinVersion:               tls.VersionTLS12,
			PreferServerCipherSuites: true,
		})
		cclog.Infof("HTTPS server listening at %s...", config.Keys.Addr)
	} else {
		cclog.Infof("HTTP server listening at %s...", config.Keys.Addr)
	}

	// Because this program will want to bind to a privileged port (like 80), the listener must
	// be established first, then the user can be changed, and after that,
	// the actual http server can be started.
	if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
		listener.Close()
		return fmt.Errorf("dropping privileges: %w", err)
	}

	// Handle context cancellation for graceful shutdown
	go func() {
		<-ctx.Done()
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		if err := s.server.Shutdown(shutdownCtx); err != nil {
			cclog.Errorf("Server shutdown error: %v", err)
		}
	}()

	// Serve returns http.ErrServerClosed after Shutdown; that is the regular
	// way to stop and therefore not reported as an error.
	if err := s.server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return fmt.Errorf("server failed: %w", err)
	}
	return nil
}
// Shutdown stops the server and its background components in this order:
// the NATS client is closed (if there is one), the HTTP server is shut down
// gracefully with a timeout of 30 seconds derived from ctx, the metric store
// is shut down, and finally the archiver is given 10 seconds to finish.
//
// Shutdown is safe to call even if Start never ran: the HTTP server step is
// skipped when no http.Server has been created yet.
func (s *Server) Shutdown(ctx context.Context) {
	// Create a shutdown context with timeout
	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	if nc := nats.GetClient(); nc != nil {
		nc.Close()
	}

	// Shut down the HTTP server gracefully (waiting for all ongoing requests).
	// s.server is only set by Start, so guard against a nil pointer.
	if s.server != nil {
		if err := s.server.Shutdown(shutdownCtx); err != nil {
			cclog.Errorf("Server shutdown error: %v", err)
		}
	}

	// Archive all the metric store data
	memorystore.Shutdown()

	// Shutdown archiver with 10 second timeout for fast shutdown
	if err := archiver.Shutdown(10 * time.Second); err != nil {
		cclog.Warnf("Archiver shutdown: %v", err)
	}
}

View File

@@ -1,92 +0,0 @@
## Intro
cc-backend requires a configuration file that specifies the cluster systems to be used.
To override the default, specify the location of a json configuration file with the `-config <file path>` command line option.
All security-related configurations, e.g. keys and passwords, are set using
environment variables.
It is supported to set these by means of a `.env` file in the project root.
## Configuration Options
* `addr`: Type string. Address where the http (or https) server will listen on (for example: 'localhost:80'). Default `:8080`.
* `user`: Type string. Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.
* `group`: Type string. Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.
* `disable-authentication`: Type bool. Disable authentication (for everything: API, Web-UI, ...). Default `false`.
* `embed-static-files`: Type bool. If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not. Default `true`.
* `static-files`: Type string. Folder where static assets can be found, if `embed-static-files` is `false`. No default.
* `db-driver`: Type string. 'sqlite3' or 'mysql' (mysql will work for mariadb as well). Default `sqlite3`.
* `db`: Type string. For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!). Default: `./var/job.db`.
* `job-archive`: Type object.
- `kind`: Type string. At the moment only `file` is supported as value.
- `path`: Type string. Path to the job-archive. Default: `./var/job-archive`.
- `compression`: Type integer. Setup automatic compression for jobs older than number of days.
- `retention`: Type object.
- `policy`: Type string (required). Retention policy. Possible values none, delete,
move.
- `includeDB`: Type boolean. Also remove jobs from database.
- `age`: Type integer. Act on jobs with startTime older than age (in days).
- `location`: Type string. The target directory for retention. Only applicable for retention policy move.
* `disable-archive`: Type bool. Keep all metric data in the metric data repositories, do not write to the job-archive. Default `false`.
* `validate`: Type bool. Validate all input json documents against json schema.
* `session-max-age`: Type string. Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire! Default `168h`.
* `https-cert-file` and `https-key-file`: Type string. If both those options are not empty, use HTTPS using those certificates.
* `redirect-http-to`: Type string. If not the empty string and `addr` does not end in ":80", redirect every request incoming at port 80 to that url.
* `machine-state-dir`: Type string. Where to store MachineState files. TODO: Explain in more detail!
* `stop-jobs-exceeding-walltime`: Type int. If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job. Default `0`.
* `short-running-jobs-duration`: Type int. Do not show running jobs shorter than X seconds. Default `300`.
* `jwts`: Type object (required). For JWT Authentication.
- `max-age`: Type string (required). Configure how long a token is valid. As string parsable by time.ParseDuration().
- `cookieName`: Type string. Cookie that should be checked for a JWT token.
- `validateUser`: Type boolean. Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.
- `trustedIssuer`: Type string. Issuer that should be accepted when validating external JWTs.
- `syncUserOnLogin`: Type boolean. Add non-existent user to DB at login attempt with values provided in JWT.
* `ldap`: Type object. For LDAP Authentication and user synchronisation. Default `nil`.
- `url`: Type string (required). URL of LDAP directory server.
- `user_base`: Type string (required). Base DN of user tree root.
- `search_dn`: Type string (required). DN for authenticating LDAP admin account with general read rights.
- `user_bind`: Type string (required). Expression used to authenticate users via LDAP bind. Must contain `uid={username}`.
- `user_filter`: Type string (required). Filter to extract users for syncing.
- `username_attr`: Type string. Attribute with full user name. Defaults to `gecos` if not provided.
- `sync_interval`: Type string. Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.
- `sync_del_old_users`: Type boolean. Delete obsolete users in database.
- `syncUserOnLogin`: Type boolean. Add non-existent user to DB at login attempt if user exists in Ldap directory.
* `clusters`: Type array of objects (required)
- `name`: Type string. The name of the cluster.
- `metricDataRepository`: Type object with properties: `kind` (Type string, can be one of `cc-metric-store`, `influxdb` ), `url` (Type string), `token` (Type string)
- `filterRanges` Type object. This option controls the slider ranges for the UI controls of numNodes, duration, and startTime. Example:
```
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
```
* `ui-defaults`: Type object. Default configuration for ui views. If overwritten, all options must be provided! Most options can be overwritten by the user via the web interface.
- `analysis_view_histogramMetrics`: Type string array. Metrics to show as job count histograms in analysis view. Default `["flops_any", "mem_bw", "mem_used"]`.
- `analysis_view_scatterPlotMetrics`: Type array of string array. Initial
scatter plot configuration in analysis view. Default `[["flops_any", "mem_bw"], ["flops_any", "cpu_load"], ["cpu_load", "mem_bw"]]`.
- `job_view_nodestats_selectedMetrics`: Type string array. Initial metrics shown in node statistics table of single job view. Default `["flops_any", "mem_bw", "mem_used"]`.
- `job_view_polarPlotMetrics`: Type string array. Metrics shown in polar plot of single job view. Default `["flops_any", "mem_bw", "mem_used", "net_bw", "file_bw"]`.
- `job_view_selectedMetrics`: Type string array. Default `["flops_any", "mem_bw", "mem_used"]`.
- `plot_general_colorBackground`: Type bool. Color plot background according to job average threshold limits. Default `true`.
- `plot_general_colorscheme`: Type string array. Initial color scheme. Default `"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"`.
- `plot_general_lineWidth`: Type int. Initial linewidth. Default `3`.
- `plot_list_jobsPerPage`: Type int. Jobs shown per page in job lists. Default `50`.
- `plot_list_selectedMetrics`: Type string array. Initial metric plots shown in jobs lists. Default `"cpu_load", "ipc", "mem_used", "flops_any", "mem_bw"`.
- `plot_view_plotsPerRow`: Type int. Number of plots per row in single job view. Default `3`.
- `plot_view_showPolarplot`: Type bool. Option to toggle polar plot in single job view. Default `true`.
- `plot_view_showRoofline`: Type bool. Option to toggle roofline plot in single job view. Default `true`.
- `plot_view_showStatTable`: Type bool. Option to toggle the node statistic table in single job view. Default `true`.
- `system_view_selectedMetric`: Type string. Initial metric shown in system view. Default `cpu_load`.
Some of the `ui-defaults` values can be appended by `:<clustername>` in order to have different settings depending on the current cluster. Those are notably `job_view_nodestats_selectedMetrics`, `job_view_polarPlotMetrics`, `job_view_selectedMetrics` and `plot_list_selectedMetrics`.
## Environment Variables
An example env file is found in this directory. Copy it to `.env` in the project root and adapt it for your needs.
* `JWT_PUBLIC_KEY` and `JWT_PRIVATE_KEY`: Base64 encoded Ed25519 keys used for JSON Web Token (JWT) authentication. You can generate your own keypair using `go run ./cmd/gen-keypair/gen-keypair.go`. More information in [README_TOKENS.md](./README_TOKENS.md).
* `SESSION_KEY`: Some random bytes used as secret for cookie-based sessions.
* `LDAP_ADMIN_PASSWORD`: The LDAP admin user password (optional).
* `CROSS_LOGIN_JWT_HS512_KEY`: Used for token based logins via another authentication service.
* `LOGLEVEL`: Can be `err`, `warn`, `info` or `debug` (optional, `warn` by default). Can be used to reduce logging.

View File

@@ -1,51 +0,0 @@
## Introduction
ClusterCockpit uses JSON Web Tokens (JWT) for authorization of its APIs.
JSON Web Token (JWT) is an open standard (RFC 7519) that defines a compact and self-contained way for securely transmitting information between parties as a JSON object.
This information can be verified and trusted because it is digitally signed.
In ClusterCockpit, JWTs are signed with a public/private key pair using the Ed25519 (EdDSA) signature scheme.
Because tokens are signed using public/private key pairs, the signature also certifies that only the party holding the private key is the one that signed it.
Expiration of the generated tokens as well as the max. length of a browser session can be configured in the `config.json` file described [here](./README.md).
The [Ed25519](https://ed25519.cr.yp.to/) algorithm for signatures was used because it is compatible with other tools that require authentication, such as NATS.io, and because these elliptic-curve methods provide similar security with smaller keys compared to something like RSA. They are slightly more expensive to validate, but that effect is negligible.
## JWT Payload
You may view the payload of a JWT token at [https://jwt.io/#debugger-io](https://jwt.io/#debugger-io).
Currently ClusterCockpit sets the following claims:
* `iat`: Issued at claim. The “iat” claim is used to identify the time at which the JWT was issued. This claim can be used to determine the age of the JWT.
* `sub`: Subject claim. Identifies the subject of the JWT, in our case this is the username.
* `roles`: An array of strings specifying the roles set for the subject.
* `exp`: Expiration date of the token (only if explicitly configured)
It is important to know that JWTs are not encrypted, only signed. This means that outsiders cannot create new JWTs or modify existing ones, but they are able to read out the username.
## Workflow
1. Create a new Ed25519 public/private keypair:
```
$ go build ./cmd/gen-keypair/
$ ./gen-keypair
```
2. Add keypair in your `.env` file. A template can be found in `./configs`.
When a user logs in via the `/login` page using a browser, a session cookie (secured using the random bytes in the `SESSION_KEY` env. variable, which you should change as well) is used for all requests after the successful login. The JWTs make it easier to use the APIs of ClusterCockpit using scripts or other external programs. The token is specified in the `Authorization` HTTP header using the [Bearer schema](https://datatracker.ietf.org/doc/html/rfc6750) (there is an example below). Tokens can be issued to users from the configuration view in the Web-UI or the command line. In order to use the token for API endpoints such as `/api/jobs/start_job/`, the user that executes it needs to have the `api` role. Regular users can only perform read-only queries and only look at data connected to jobs they started themselves.
## cc-metric-store
The [cc-metric-store](https://github.com/ClusterCockpit/cc-metric-store) also uses JWTs for authentication. As it does not issue new tokens, it does not need to know the private key. The public key of the keypair that is used to generate the JWTs that grant access to the `cc-metric-store` can be specified in its `config.json`. When configuring the `metricDataRepository` object in the `cluster.json` file, you can put a token issued by ClusterCockpit itself.
## Setup user and JWT token for REST API authorization
1. Create user:
```
$ ./cc-backend --add-user <username>:api:<password> --no-server
```
2. Issue token for user:
```
$ ./cc-backend --jwt <username> --no-server
```
3. Use the issued token on the client side:
```
$ curl -X GET "<API ENDPOINT>" -H "accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer <JWT TOKEN>"
```

View File

@@ -1,56 +1,96 @@
{
"main": {
"addr": "127.0.0.1:8080",
"archive": {
"kind": "file",
"path": "./var/job-archive"
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
},
"apiAllowedIPs": [
"*"
],
"emission-constant": 317
},
"cron": {
"commit-job-worker": "2m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
},
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"max-age": "2000h"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"clusters": [
{
"name": "fritz",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
],
"metric-store": {
"checkpoints": {
"file-format": "avro",
"interval": "1h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "1h",
"directory": "./var/archive"
},
"retention-in-memory": "48h",
"subscriptions": [
{
"subscribe-to": "hpc-nats",
"cluster-tag": "fritz"
},
{
"subscribe-to": "hpc-nats",
"cluster-tag": "alex"
}
]
}
}
}

View File

@@ -1,50 +1,49 @@
{
"main": {
"addr": "0.0.0.0:443",
"ldap": {
"url": "ldaps://test",
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
"user_filter": "(&(objectclass=posixAccount))"
},
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"validate": true,
"clusters": [
{
"name": "test",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "eyJhbGciOiJF-E-pQBQ"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"validate": false,
"apiAllowedIPs": ["*"],
"short-running-jobs-duration": 300,
"resampling": {
"minimumPoints": 600,
"trigger": 180,
"resolutions": [
240,
60
]
}
},
"cron": {
"commit-job-worker": "2m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "test",
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
],
"jwts": {
"cookieName": "",
"validateUser": false,
"max-age": "2m",
"trustedIssuer": ""
},
"short-running-jobs-duration": 300
}
}
]
}

View File

@@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
my $node;
my @sockets;
my @nodeCores;
foreach my $socket ( @{$DOMAINS{socket}} ) {
push @sockets, "[".join(",", @{$socket})."]";
$node .= join(",", @{$socket})
push @nodeCores, join(",", @{$socket});
}
$node = join(",", @nodeCores);
$INFO{sockets} = join(",\n", @sockets);
my @memDomains;
@@ -212,9 +214,27 @@ print <<"END";
"socketsPerNode": $INFO{socketsPerNode},
"coresPerSocket": $INFO{coresPerSocket},
"threadsPerCore": $INFO{threadsPerCore},
"flopRateScalar": $flopsScalar,
"flopRateSimd": $flopsSimd,
"memoryBandwidth": $memBw,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": $flopsScalar
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": $flopsSimd
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": $memBw
},
"nodes": "<FILL IN NODE RANGES>",
"topology": {
"node": [$node],

View File

@@ -0,0 +1,22 @@
{
"cluster": "fritz",
"jobId": 123000,
"jobState": "running",
"numAcc": 0,
"numHwthreads": 72,
"numNodes": 1,
"partition": "main",
"requestedMemory": 128000,
"resources": [{ "hostname": "f0726" }],
"startTime": 1649723812,
"subCluster": "main",
"submitTime": 1649723812,
"user": "k106eb10",
"project": "k106eb",
"walltime": 86400,
"metaData": {
"slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n",
"jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n",
"jobName": "ams_pipeline"
}
}

View File

@@ -0,0 +1,7 @@
{
"cluster": "fritz",
"jobId": 123000,
"jobState": "completed",
"startTime": 1649723812,
"stopTime": 1649763839
}

45
configs/uiConfig.json Normal file
View File

@@ -0,0 +1,45 @@
{
"jobList": {
"usePaging": false,
"showFootprint":false
},
"jobView": {
"showPolarPlot": true,
"showFootprint": true,
"showRoofline": true,
"showStatTable": true
},
"metricConfig": {
"jobListMetrics": ["mem_bw", "flops_dp"],
"jobViewPlotMetrics": ["mem_bw", "flops_dp"],
"jobViewTableMetrics": ["mem_bw", "flops_dp"],
"clusters": [
{
"name": "test",
"subClusters": [
{
"name": "one",
"jobListMetrics": ["mem_used", "flops_sp"]
}
]
}
]
},
"nodeList": {
"usePaging": true
},
"plotConfiguration": {
"plotsPerRow": 3,
"colorBackground": true,
"lineWidth": 3,
"colorScheme": [
"#00bfff",
"#0000ff",
"#ff00ff",
"#ff0000",
"#ff8000",
"#ffff00",
"#80ff00"
]
}
}

View File

@@ -1,38 +0,0 @@
# Release versions
Versions are marked according to [semantic versioning](https://semver.org).
Each version embeds the following static assets in the binary:
* Web frontend with javascript files and all static assets.
* Golang template files for server-side rendering.
* JSON schema files for validation.
* Database migration files.
The remaining external assets are:
* The SQL database used.
* The job archive
* The configuration files `config.json` and `.env`.
The external assets are versioned with integer IDs.
This means that each release binary is bound to specific versions of the SQL
database and the job archive.
The configuration file is checked against the current schema at startup.
The `-migrate-db` command line switch can be used to upgrade the SQL database
to migrate from a previous version to the latest one.
We offer a separate tool `archive-migration` to migrate an existing job archive
from the previous to the latest version.
# Versioning of APIs
cc-backend provides two API backends:
* A REST API for querying jobs.
* A GraphQL API for data exchange between web frontend and cc-backend.
The REST API will also be versioned. We still have to decide whether we will also
support older REST API versions by versioning the endpoint URLs.
The GraphQL API is for internal use and will not be versioned.
# How to build
In general it is recommended to use the provided release binary.
In case you want to build `cc-backend` yourself, please always use the provided makefile. This will ensure
that the frontend is also built correctly and that the version is encoded in the binary.

View File

@@ -1,234 +0,0 @@
# Hands-on setup ClusterCockpit from scratch (w/o docker)
## Prerequisites
* perl
* go
* npm
* Optional: curl
* Script migrateTimestamps.pl
## Documentation
You find READMEs or api docs in
* ./cc-backend/configs
* ./cc-backend/init
* ./cc-backend/api
## ClusterCockpit configuration files
### cc-backend
* `./.env` Passwords and Tokens set in the environment
* `./config.json` Configuration options for cc-backend
### cc-metric-store
* `./config.json` Optional to overwrite configuration options
### cc-metric-collector
Not yet included in the hands-on setup.
## Setup Components
Start by creating a base folder for all of the following steps.
* `mkdir clustercockpit`
* `cd clustercockpit`
### Setup cc-backend
* Clone Repository
- `git clone https://github.com/ClusterCockpit/cc-backend.git`
- `cd cc-backend`
* Build
- `make`
* Activate & configure environment for cc-backend
- `cp configs/env-template.txt .env`
- Optional: Have a look via `vim .env`
- Copy the `config.json` file included in this tarball into the root directory of cc-backend: `cp ../../config.json ./`
* Back to toplevel `clustercockpit`
- `cd ..`
* Prepare Datafolder and Database file
- `mkdir var`
- `./cc-backend -migrate-db`
### Setup cc-metric-store
* Clone Repository
- `git clone https://github.com/ClusterCockpit/cc-metric-store.git`
- `cd cc-metric-store`
* Build Go Executable
- `go get`
- `go build`
* Prepare Datafolders
- `mkdir -p var/checkpoints`
- `mkdir -p var/archive`
* Update Config
- `vim config.json`
- Exchange existing setting in `metrics` with the following:
```
"clock": { "frequency": 60, "aggregation": null },
"cpi": { "frequency": 60, "aggregation": null },
"cpu_load": { "frequency": 60, "aggregation": null },
"flops_any": { "frequency": 60, "aggregation": null },
"flops_dp": { "frequency": 60, "aggregation": null },
"flops_sp": { "frequency": 60, "aggregation": null },
"ib_bw": { "frequency": 60, "aggregation": null },
"lustre_bw": { "frequency": 60, "aggregation": null },
"mem_bw": { "frequency": 60, "aggregation": null },
"mem_used": { "frequency": 60, "aggregation": null },
"rapl_power": { "frequency": 60, "aggregation": null }
```
* Back to toplevel `clustercockpit`
- `cd ..`
### Setup Demo Data
* `mkdir source-data`
* `cd source-data`
* Download JobArchive-Source:
- `wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-dev.tar.xz`
- `tar xJf job-archive-dev.tar.xz`
- `mv ./job-archive ./job-archive-source`
- `rm ./job-archive-dev.tar.xz`
* Download CC-Metric-Store Checkpoints:
- `mkdir -p cc-metric-store-source/checkpoints`
- `cd cc-metric-store-source/checkpoints`
- `wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/cc-metric-store-checkpoints.tar.xz`
- `tar xf cc-metric-store-checkpoints.tar.xz`
- `rm cc-metric-store-checkpoints.tar.xz`
* Back to `source-data`
- `cd ../..`
* Run timestamp migration script. This may take tens of minutes!
- `cp ../migrateTimestamps.pl .`
- `./migrateTimestamps.pl`
- Expected output:
```
Starting to update start- and stoptimes in job-archive for emmy
Starting to update start- and stoptimes in job-archive for woody
Done for job-archive
Starting to update checkpoint filenames and data starttimes for emmy
Starting to update checkpoint filenames and data starttimes for woody
Done for checkpoints
```
* Copy `cluster.json` files from source to migrated folders
- `cp source-data/job-archive-source/emmy/cluster.json cc-backend/var/job-archive/emmy/`
- `cp source-data/job-archive-source/woody/cluster.json cc-backend/var/job-archive/woody/`
* Initialize Job-Archive in SQLite3 job.db and add demo user
- `cd cc-backend`
- `./cc-backend -init-db -add-user demo:admin:demo`
- Expected output:
```
<6>[INFO] new user "demo" created (roles: ["admin"], auth-source: 0)
<6>[INFO] Building job table...
<6>[INFO] A total of 3936 jobs have been registered in 1.791 seconds.
```
* Back to toplevel `clustercockpit`
- `cd ..`
### Startup both Apps
* In cc-backend root: `$./cc-backend -server -dev`
- Starts Clustercockpit at `http://localhost:8080`
- Log: `<6>[INFO] HTTP server listening at :8080...`
- Use local internet browser to access interface
- You should see and be able to browse finished Jobs
- Metadata is read from SQLite3 database
- Metricdata is read from job-archive/JSON-Files
- Create User in settings (top-right corner)
- Name `apiuser`
- Username `apiuser`
- Role `API`
- Submit & Refresh Page
- Create JWT for `apiuser`
- In Userlist, press `Gen. JWT` for `apiuser`
- Save JWT for later use
* In cc-metric-store root: `$./cc-metric-store`
- Start the cc-metric-store on `http://localhost:8081`, Log:
```
2022/07/15 17:17:42 Loading checkpoints newer than 2022-07-13T17:17:42+02:00
2022/07/15 17:17:45 Checkpoints loaded (5621 files, 319 MB, that took 3.034652s)
2022/07/15 17:17:45 API http endpoint listening on '0.0.0.0:8081'
```
- Does *not* have a graphical interface
- Optional: Test function by executing:
```
$ curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" -D - "http://localhost:8081/api/query" -d "{ \"cluster\": \"emmy\", \"from\": $(expr $(date +%s) - 60), \"to\": $(date +%s), \"queries\": [{
\"metric\": \"flops_any\",
\"host\": \"e1111\"
}] }"
HTTP/1.1 200 OK
Content-Type: application/json
Date: Fri, 15 Jul 2022 13:57:22 GMT
Content-Length: 119
{"results":[[JSON-DATA-ARRAY]]}
```
### Development API web interfaces
The `-dev` flag enables web interfaces to document and test the apis:
* http://localhost:8080/playground - A GraphQL playground. To use it you must have an authenticated session in the same browser.
* http://localhost:8080/swagger - A Swagger UI. To use it you have to be logged out, so no user session in the same browser. Use the JWT token with role API generated previously to authenticate via http header.
### Use cc-backend API to start job
* Enter the URL `http://localhost:8080/swagger/index.html` in your browser.
* Enter your JWT token you generated for the API user by clicking the green Authorize button in the upper right part of the window.
* Click the `/job/start_job` endpoint and click the Try it out button.
* Enter the following json into the request body text area and fill in a recent start timestamp by executing `date +%s`:
```
{
"jobId": 100000,
"arrayJobId": 0,
"user": "ccdemouser",
"subCluster": "main",
"cluster": "emmy",
"startTime": <date +%s>,
"project": "ccdemoproject",
"resources": [
{"hostname": "e0601"},
{"hostname": "e0823"},
{"hostname": "e0337"},
{"hostname": "e1111"}],
"numNodes": 4,
"numHwthreads": 80,
"walltime": 86400
}
```
* The response body should be the database id of the started job, for example:
```
{
"id": 3937
}
```
* Check in ClusterCockpit
- User `ccdemouser` should appear in Users-Tab with one running job
- It could take up to 5 Minutes until the Job is displayed with some current data (5 Min Short-Job Filter)
- Job then is marked with a green `running` tag
- Metricdata displayed is read from cc-metric-store!
### Use cc-backend API to stop job
* Enter the URL `http://localhost:8080/swagger/index.html` in your browser.
* Enter your JWT token you generated for the API user by clicking the green Authorize button in the upper right part of the window.
* Click the `/job/stop_job/{id}` endpoint and click the Try it out button.
* Enter the database id at id that was returned by `start_job` and copy the following into the request body. Replace the timestamp with a recent one:
```
{
"cluster": "emmy",
"jobState": "completed",
"stopTime": <RECENT TS>
}
```
* On success a json document with the job meta data is returned.
* Check in ClusterCockpit
- User `ccdemouser` should appear in Users-Tab with one completed job
- Job is no longer marked with a green `running` tag -> Completed!
- Metricdata displayed is now read from job-archive!
* Check in job-archive
- `cd ./cc-backend/var/job-archive/emmy/100/000`
- `cd $STARTTIME`
- Inspect `meta.json` and `data.json`
## Helper scripts
* In this tarball you can find the perl script `generate_subcluster.pl` that helps to generate the subcluster section for your system.
Usage:
* Log into an exclusive cluster node.
* The LIKWID tools likwid-topology and likwid-bench must be in the PATH!
* `$./generate_subcluster.pl` outputs the subcluster section on `stdout`
Please be aware that
* You have to enter the name and node list for the subCluster manually.
* GPU detection only works if LIKWID was built with CUDA available and you run likwid-topology also with CUDA loaded.
* Do not blindly trust the measured peakflops values.
* Because the script blindly relies on the CSV format output by likwid-topology this is a fragile undertaking!

View File

@@ -1,99 +0,0 @@
## Introduction
ClusterCockpit uses JSON Web Tokens (JWT) for authorization of its APIs. JSON
Web Token (JWT) is an open standard (RFC 7519) that defines a compact and
self-contained way for securely transmitting information between parties as a
JSON object. This information can be verified and trusted because it is
digitally signed. In ClusterCockpit JWTs are signed using a public/private key
pair using ECDSA. Because tokens are signed using public/private key pairs, the
signature also certifies that only the party holding the private key is the one
that signed it. Token expiration is set to the configuration option MaxAge.
## JWT Payload
You may view the payload of a JWT token at [https://jwt.io/#debugger-io](https://jwt.io/#debugger-io).
Currently ClusterCockpit sets the following claims:
* `iat`: Issued at claim. The “iat” claim is used to identify the time at which the JWT was issued. This claim can be used to determine the age of the JWT.
* `sub`: Subject claim. Identifies the subject of the JWT, in our case this is the username.
* `roles`: An array of strings specifying the roles set for the subject.
## Workflow
1. Create a new ECDSA Public/private keypair:
```
$ go build ./tools/gen-keypair.go
$ ./gen-keypair
```
2. Add keypair in your `.env` file. A template can be found in `./configs`.
There are two usage scenarios:
* The APIs are used during a browser session. API accesses are authorized with
the active session.
* The REST API is used outside a browser session, e.g. by scripts. In this case
you have to issue a token manually. This is possible from within the
configuration view or on the command line. It is recommended to issue a JWT
token in this case for a special user that only has the `api` role. By using
different users for different purposes a fine grained access control and
access revocation management is possible.
The token is commonly specified in the Authorization HTTP header using the Bearer schema.
## Setup user and JWT token for REST API authorization
1. Create user:
```
$ ./cc-backend --add-user <username>:api:<Password> --no-server
```
2. Issue token for user:
```
$ ./cc-backend -jwt <username> -no-server
```
3. Use issued token on client side:
```
$ curl -X GET "<API ENDPOINT>" -H "accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer <JWT TOKEN>"
```
## Accept externally generated JWTs provided via cookie
If there is an external service like an AuthAPI that can generate JWTs and hand
them over to ClusterCockpit via cookies, CC can be configured to accept them:
1. `.env`: CC needs a public ed25519 key to verify foreign JWT signatures.
Public keys in PEM format can be converted with the instructions in
[/tools/convert-pem-pubkey-for-cc](../tools/convert-pem-pubkey-for-cc/Readme.md)
.
```
CROSS_LOGIN_JWT_PUBLIC_KEY="+51iXX8BdLFocrppRxIw52xCOf8xFSH/eNilN5IHVGc="
```
2. `config.json`: Insert a name for the cookie (set by the external service)
containing the JWT so that CC knows where to look at. Define a trusted issuer
(JWT claim 'iss'), otherwise it will be rejected. If you want usernames and
user roles from JWTs ('sub' and 'roles' claim) to be validated against CC's
internal database, you need to enable it here. Unknown users will then be
rejected and roles set via JWT will be ignored.
```json
"jwts": {
"cookieName": "access_cc",
"forceJWTValidationViaDatabase": true,
"trustedExternalIssuer": "auth.example.com"
}
```
3. Make sure your external service includes the same issuer (`iss`) in its JWTs.
Example JWT payload:
```json
{
"iat": 1668161471,
"nbf": 1668161471,
"exp": 1668161531,
"sub": "alice",
"roles": [
"user"
],
"jti": "a1b2c3d4-1234-5678-abcd-a1b2c3d4e5f6",
"iss": "auth.example.com"
}
```

View File

@@ -1,78 +0,0 @@
The job archive specifies an exchange format for job meta and performance metric
data. It consists of two parts:
* a [SQLite database schema](https://github.com/ClusterCockpit/cc-backend/wiki/Job-Archive#sqlite-database-schema) for job meta data and performance statistics
* a [Json file format](https://github.com/ClusterCockpit/cc-backend/wiki/Job-Archive#json-file-format) together with a [Directory hierarchy specification](https://github.com/ClusterCockpit/cc-backend/wiki/Job-Archive#directory-hierarchy-specification)
By using an open, portable and simple specification based on files it is
possible to exchange job performance data for research and analysis purposes as
well as use it as a robust way for archiving job performance data to disk.
# SQLite database schema
## Introduction
A SQLite 3 database schema is provided to standardize the job meta data
information in a portable way. The schema also includes optional columns for job
performance statistics (called a job performance footprint). The database acts
as a front end to filter and select subsets of job IDs, that are the keys to get
the full job performance data in the job performance tree hierarchy.
## Database schema
The schema includes 3 tables: the job table, a tag table and a jobtag table
representing the MANY-TO-MANY relation between jobs and tags. The SQL schema is
specified
[here](https://github.com/ClusterCockpit/cc-specifications/blob/master/schemas/jobs-sqlite.sql).
Explanation of the various columns including the JSON datatypes is documented
[here](https://github.com/ClusterCockpit/cc-specifications/blob/master/datastructures/job-meta.schema.json).
# Directory hierarchy specification
## Specification
To manage the number of directories within a single directory a tree approach is
used splitting the integer job ID. The job id is split in chunks of 1000 each.
Usually 2 layers of directories are sufficient but the concept can be used for an
arbitrary number of layers.
For a 2 layer schema this can be achieved with (code example in Perl):
``` perl
$level1 = $jobID/1000;
$level2 = $jobID%1000;
$dstPath = sprintf("%s/%s/%d/%03d", $trunk, $destdir, $level1, $level2);
```
## Example
For the job ID 1034871 the directory path is `./1034/871/`.
# Json file format
## Overview
Every cluster must be configured in a `cluster.json` file.
The job data consists of two files:
* `meta.json`: Contains job meta information and job statistics.
* `data.json`: Contains complete job data with time series
The description of the json format specification is available as a
[json schema](https://json-schema.org/) format file. The latest version of the json
schema is part of the `cc-backend` source tree. For external reference it is
also available in a separate repository.
## Specification `cluster.json`
The json schema specification is available
[here](https://github.com/ClusterCockpit/cc-specifications/blob/master/datastructures/cluster.schema.json).
## Specification `meta.json`
The json schema specification is available
[here](https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-meta.schema.json).
## Specification `data.json`
The json schema specification is available
[here](https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-data.schema.json).
Metric time series data is stored for a fixed time step. The time step is set
per metric. If no value is available for a timestamp of a metric time series,
`null` is entered.

View File

@@ -1,29 +0,0 @@
# Overview
Customizing `cc-backend` means changing the logo, legal texts, and the login
template instead of the placeholders. You can also place a text file in `./var`
to add dynamic status or notification messages to the clusterCockpit homepage.
# Replace legal texts
To replace the `imprint.tmpl` and `privacy.tmpl` legal texts, you can place your
version in `./var/`. At startup `cc-backend` will check if `./var/imprint.tmpl` and/or
`./var/privacy.tmpl` exist and use them instead of the built-in placeholders.
You can use the placeholders in `web/templates` as a blueprint.
# Replace login template
To replace the default login layout and styling, you can place your version in
`./var/`. At startup `cc-backend` will check if `./var/login.tmpl` exists and use
it instead of the built-in placeholder. You can use the default template
`web/templates/login.tmpl` as a blueprint.
# Replace logo
To change the logo displayed in the navigation bar, you can provide the file
`logo.png` in the folder `./var/img/`. On startup `cc-backend` will check if the
folder exists and use the images provided there instead of the built-in images.
You may also place additional images there you use in a custom login template.
# Add notification banner on homepage
To add a notification banner you can add a file `notice.txt` to `./var`. As long
as this file is present all text in this file is shown in an info banner on the
homepage.

View File

@@ -1,78 +0,0 @@
In general, an upgrade is nothing more than a replacement of the binary file.
All the necessary files, except the database file, the configuration file and
the job archive, are embedded in the binary file. It is recommended to use a
directory where the file names of the binary files are named with a version
indicator. This can be, for example, the date or the Unix epoch time. A symbolic
link points to the version to be used. This makes it easier to switch to earlier
versions.
The database and the job archive are versioned. Each release binary supports
specific versions of the database and job archive. If a version mismatch is
detected, the application is terminated and migration is required.
**IMPORTANT NOTE**
It is recommended to make a backup copy of the database before each update. This
is mandatory in case the database needs to be migrated. In the case of sqlite,
this means stopping `cc-backend` and copying the sqlite database file
somewhere.
# Migrating the database
After you have backed up the database, run the following command to migrate the
database to the latest version:
```
$ ./cc-backend -migrate-db
```
The migration files are embedded in the binary and can also be viewed in the
cc-backend [source tree](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository/migrations).
There are separate migration files for both supported
database backends.
We use the [migrate library](https://github.com/golang-migrate/migrate).
If something goes wrong, you can check the status and get the current schema
(here for sqlite):
```
$ sqlite3 var/job.db
```
In the sqlite console execute:
```
.schema
```
to get the current database schema.
You can query the current version and whether the migration failed with:
```
SELECT * FROM schema_migrations;
```
The first column indicates the current database version and the second column is
a dirty flag indicating whether the migration was successful.
# Migrating the job archive
Job archive migration requires a separate tool (`archive-migration`), which is
part of the cc-backend source tree (build with `go build ./tools/archive-migration`)
and is also provided as part of the releases.
Migration is supported only between two successive releases. The migration tool
migrates the existing job archive to a new job archive. This means that there
must be enough disk space for two complete job archives. If the tool is called
without options:
```
$ ./archive-migration
```
it is assumed that a job archive exists in `./var/job-archive`. The new job
archive is written to `./var/job-archive-new`. Since execution is threaded, in case
of a fatal error it is impossible to determine in which job the error occurred.
In this case, you can run the tool in debug mode (with the `-debug` flag). In
debug mode, threading is disabled and the job ID of each migrated job is output.
Jobs with empty files will be skipped. Between multiple runs of the tools, the
`job-archive-new` directory must be moved or deleted.
The `cluster.json` files in `job-archive-new` must be checked for errors, especially
whether the aggregation attribute is set correctly for all metrics.
Migration takes several hours for relatively large job archives (several hundred
GB). A versioned job archive contains a version.txt file in the root directory
of the job archive. This file contains the version as an unsigned integer.

View File

@@ -1,180 +0,0 @@
# Overview
The authentication is implemented in `internal/auth/`. In `auth.go`
an interface is defined that any authentication provider must fulfill. It also
acts as a dispatcher to delegate the calls to the available authentication
providers.
Two authentication types are available:
* JWT authentication for the REST API that does not create a session cookie
* Session based authentication using a session cookie
The most important routines in auth are:
* `Login()` Handle POST request to login user and start a new session
* `Auth()` Authenticate user and put User Object in context of the request
The http router calls auth in the following cases:
* `r.Handle("/login", authentication.Login( ... )).Methods(http.MethodPost)`:
The POST request on the `/login` route will call the Login callback.
* `r.Handle("/jwt-login", authentication.Login( ... ))`:
Any request on the `/jwt-login` route will call the Login callback. Intended
for use for the JWT token based authenticators.
* Any route in the secured subrouter will always call Auth(), on success it will
call the next handler in the chain, on failure it will render the login
template.
```
secured.Use(func(next http.Handler) http.Handler {
return authentication.Auth(
// On success;
next,
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
// Render login form
})
})
```
A JWT token can be used to initiate an authenticated user
session. This can either happen by calling the login route with a token
provided in a header or via a special cookie containing the JWT token.
For API routes the access is authenticated on every request using the JWT token
and no session is initiated.
# Login
The Login function (located in `auth.go`):
* Extracts the user name and gets the user from the user database table. In case the
user is not found the user object is set to nil.
* Iterates over all authenticators and:
- Calls its `CanLogin` function which checks if the authentication method is
supported for this user.
- Calls its `Login` function to authenticate the user. On success a valid user
object is returned.
- Creates a new session object, stores the user attributes in the session and
saves the session.
- Starts the `onSuccess` http handler
## Local authenticator
This authenticator is applied if
```
return user != nil && user.AuthSource == AuthViaLocalPassword
```
Compares the password provided by the login form to the password hash stored in
the user database table:
```
if e := bcrypt.CompareHashAndPassword([]byte(user.Password), []byte(r.FormValue("password"))); e != nil {
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
return nil, fmt.Errorf("Authentication failed")
}
```
## LDAP authenticator
This authenticator is applied if the user was found in the database and its
AuthSource is LDAP:
```
if user != nil {
if user.AuthSource == schema.AuthViaLDAP {
return user, true
}
}
```
If the option `SyncUserOnLogin` is set it tries to sync the user from the LDAP
directory. In case this succeeds the user is persisted to the database and can
login.
Gets the LDAP connection and tries a bind with the provided credentials:
```
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v", user.Username, err)
return nil, fmt.Errorf("Authentication failed")
}
```
## JWT Session authenticator
Login via JWT token will create a session without password.
For login the `X-Auth-Token` header is not supported. This authenticator is
applied if the Authorization header or query parameter login-token is present:
```
return user, r.Header.Get("Authorization") != "" ||
r.URL.Query().Get("login-token") != ""
```
The Login function:
* Parses the token and checks if it is expired
* Check if the signing method is EdDSA or HS256 or HS512
* Check if claims are valid and extracts the claims
* The following claims have to be present:
- `sub`: The subject, in this case this is the username
- `exp`: Expiration in Unix epoch time
- `roles`: String array with roles of user
* In case user does not exist in the database and the option `SyncUserOnLogin`
is set add user to user database table with `AuthViaToken` AuthSource.
* Return valid user object
## JWT Cookie Session authenticator
Login via JWT cookie token will create a session without password.
It is first checked if the required configuration options are set:
* `trustedIssuer`
* `CookieName`
and optionally the environment variable `CROSS_LOGIN_JWT_PUBLIC_KEY` is set.
This authenticator is applied if the configured cookie is present:
```
jwtCookie, err := r.Cookie(cookieName)
if err == nil && jwtCookie.Value != "" {
return true
}
```
The Login function:
* Extracts and parses the token
* Checks if signing method is Ed25519/EdDSA
* In case publicKeyCrossLogin is configured:
- Check if `iss` issuer claim matches trusted issuer from configuration
- Return public cross login key
- Otherwise return standard public key
* Check if claims are valid
* Depending on the option `validateUser` the roles are
extracted from JWT token or taken from user object fetched from database
* Ask browser to delete the JWT cookie
* In case user does not exist in the database and the option `SyncUserOnLogin`
is set add user to user database table with `AuthViaToken` AuthSource.
* Return valid user object
# Auth
The Auth function (located in `auth.go`):
* Returns a new http handler function that is defined right away
* This handler tries two methods to authenticate a user:
- Via a JWT API token in `AuthViaJWT()`
- Via a valid session in `AuthViaSession()`
* If err is nil and the user object is valid it puts the user object in the
request context and starts the onSuccess http handler
* Otherwise it calls the onFailure handler
## AuthViaJWT
Implemented in JWTAuthenticator:
* Extract token either from header `X-Auth-Token` or `Authorization` with Bearer
prefix
* Parse token and check if it is valid. The Parse routine will also check if the
token is expired.
* If the option `validateUser` is set it will ensure the
user object exists in the database and takes the roles from the database user
* Otherwise the roles are extracted from the roles claim
* Returns a valid user object with AuthType set to AuthToken
## AuthViaSession
* Extracts session
* Get values username, projects, and roles from session
* Returns a valid user object with AuthType set to AuthSession

View File

@@ -1,33 +0,0 @@
## Tips for frontend development
The frontend assets, including the Svelte js files, are by default embedded in
the Go binary. To enable a quick turnaround cycle for web development of the
frontend disable embedding of static assets in `config.json`:
```
"embed-static-files": false,
"static-files": "./web/frontend/public/",
```
Start the node build process (in directory `./web/frontend`) in development mode:
```
$ npm run dev
```
This will start the build process in listen mode. Whenever you change a source
file, the dependent javascript targets will be automatically rebuilt.
In case the javascript files are minified you may need to set the production
flag by hand to false in `./web/frontend/rollup.config.mjs`:
```
const production = false
```
Usually this should work automatically.
Because the files are still served by ./cc-backend you have to reload the view
explicitly in your browser.
A common setup is to have three terminals open:
* One running cc-backend (working directory repository root): `./cc-backend -server -dev`
* Another running npm in developer mode (working directory `./web/frontend`): `npm run dev`
* And the last one editing the frontend source files

View File

@@ -1,13 +0,0 @@
# Steps to prepare a release
1. On `hotfix` branch:
* Update ReleaseNotes.md
* Update version in Makefile
* Commit, push, and pull request
* Merge in master
2. On Linux host:
* Pull master
* Ensure that GitHub Token environment variable `GITHUB_TOKEN` is set
* Create release tag: `git tag v1.1.0 -m release`
* Execute `goreleaser release`

View File

@@ -1,34 +0,0 @@
## Overview
We use the standard golang testing environment.
The following conventions are used:
* *White box unit tests*: Tests for internal functionality are placed in files
* *Black box unit tests*: Tests for public interfaces are placed in files
with `<package name>_test.go` and belong to the package `<package_name>_test`.
There only exists one package test file per package.
* *Integration tests*: Tests that use multiple components are placed in a
package test file. These are named `<package name>_test.go` and belong to the
package `<package_name>_test`.
* *Test assets*: Any required files are placed in a directory `./testdata`
within each package directory.
## Executing tests
Visual Studio Code has a very good golang test integration.
For debugging a test this is the recommended solution.
The Makefile provided by us has a `test` target that executes:
```
$ go clean -testcache
$ go build ./...
$ go vet ./...
$ go test ./...
```
Of course the commands can also be used on the command line.
For details about golang testing refer to the standard documentation:
* [Testing package](https://pkg.go.dev/testing)
* [go test command](https://pkg.go.dev/cmd/go#hdr-Test_packages)

View File

@@ -1,229 +0,0 @@
#!/usr/bin/env perl
# Prelude of the demo-data preparation script: loads the modules used below,
# creates the shared JSON encoder/decoder and computes the reference
# timestamps that the job-archive and checkpoint sections rely on.
use strict;
use warnings;
use utf8;
use JSON::PP; # from Perl default install
use Time::Local qw( timelocal ); # from Perl default install
use Time::Piece; # from Perl default install
### JSON
# Single encoder/decoder instance reused for every meta.json / checkpoint file.
my $json = JSON::PP->new->allow_nonref;
### TIME AND DATE
# now (Time::Piece overrides localtime, so this is an object with ->epoch etc.)
my $localtime = localtime;
my $epochtime = $localtime->epoch;
# 5 days ago: Via epoch due to possible reverse month borders
my $epochlessfive = $epochtime - (86400 * 5);
my $locallessfive = localtime($epochlessfive);
# Local midnight of the day five days ago.
# Calc like `date --date 'TZ="Europe/Berlin" 0:00 5 days ago' +%s`
# NOTE(review): ->_mon is presumably the 0-based month that timelocal expects;
# confirm against the Time::Piece / Time::Local documentation.
my ($day, $month, $year) = ($locallessfive->mday, $locallessfive->_mon, $locallessfive->year);
my $checkpointStart = timelocal(0, 0, 0, $day, $month, $year);
# for checkpoints: spacing in seconds (12 hours) between consecutive checkpoint files
my $halfday = 43200;
### JOB-ARCHIVE
# Copies every job found below $archiveSrc into $archiveTarget, rewriting
# startTime/stopTime in meta.json to a random recent time so the demo data
# looks current. Layout walked: <src>/<cluster>/<level1>/<level2>[/<subfolder>].
my $archiveTarget = './cc-backend/var/job-archive';
my $archiveSrc = './source-data/job-archive-source';
my @ArchiveClusters;
# Gen folder
if ( not -d $archiveTarget ){
mkdir( $archiveTarget ) or die "Couldn't create $archiveTarget directory, $!";
}
# Get clusters by job-archive/$subfolder
# NOTE(review): $dh is never closed with closedir in this section.
opendir my $dh, $archiveSrc or die "can't open directory: $!";
while ( readdir $dh ) {
# readdir in a bare while condition assigns each entry to $_
chomp; next if $_ eq '.' or $_ eq '..' or $_ eq 'job-archive';
my $cluster = $_;
push @ArchiveClusters, $cluster;
}
# start for jobarchive
foreach my $cluster ( @ArchiveClusters ) {
print "Starting to update start- and stoptimes in job-archive for $cluster\n";
my $clusterTarget = "$archiveTarget/$cluster";
if ( not -d $clusterTarget ){
mkdir( $clusterTarget ) or die "Couldn't create $clusterTarget directory, $!";
}
# NOTE(review): $dhLevel1 and $dhLevel2 below are not closed explicitly either.
opendir my $dhLevel1, "$archiveSrc/$cluster" or die "can't open directory: $!";
while ( readdir $dhLevel1 ) {
chomp; next if $_ eq '.' or $_ eq '..';
my $level1 = $_;
# Only descend into directories; plain files at this level are ignored.
if ( -d "$archiveSrc/$cluster/$level1" ) {
opendir my $dhLevel2, "$archiveSrc/$cluster/$level1" or die "can't open directory: $!";
while ( readdir $dhLevel2 ) {
chomp; next if $_ eq '.' or $_ eq '..';
my $level2 = $_;
my $jobSource = "$archiveSrc/$cluster/$level1/$level2";
# NOTE(review): $jobOrigin is assigned but not used anywhere in this section.
my $jobOrigin = "$jobSource";
my $jobTargetL1 = "$clusterTarget/$level1";
my $jobTargetL2 = "$jobTargetL1/$level2";
# check if files are directly accessible (old format) else get subfolders as file and update path
if ( ! -e "$jobSource/meta.json") {
opendir(D, "$jobSource") || die "Can't open directory $jobSource: $!\n";
my @folders = readdir(D);
closedir(D);
if (!@folders) {
next;
}
# NOTE(review): every non-dot entry is appended to $jobSource in turn, so
# with more than one subfolder the path becomes "<job>/<a>/<b>/..." —
# this presumably assumes exactly one subfolder per job; verify.
foreach my $folder ( @folders ) {
next if $folder eq '.' or $folder eq '..';
$jobSource = "$jobSource/".$folder;
}
}
# check if subfolder contains file, else skip
if ( ! -e "$jobSource/meta.json") {
print "$jobSource skipped\n";
next;
}
# Slurp the whole meta.json ("local $/" disables the line separator).
open my $metafh, '<', "$jobSource/meta.json" or die "Can't open file $!";
my $rawstr = do { local $/; <$metafh> };
close($metafh);
my $metadata = $json->decode($rawstr);
# NOTE Start meta.json iteration here
# my $random_number = int(rand(UPPERLIMIT)) + LOWERLIMIT;
# Set new startTime: now minus a random offset of 86400..518399 seconds,
# i.e. between 1 day and just under 6 days before now.
# NOTE(review): an earlier comment here mentioned removing the id from the
# attributes; no such removal happens in the visible code.
$metadata->{startTime} = $epochtime - (int(rand(432000)) + 86400);
$metadata->{stopTime} = $metadata->{startTime} + $metadata->{duration};
# Add starttime subfolder to target path
my $jobTargetL3 = "$jobTargetL2/".$metadata->{startTime};
if ( not -d $jobTargetL1 ){
mkdir( $jobTargetL1 ) or die "Couldn't create $jobTargetL1 directory, $!";
}
if ( not -d $jobTargetL2 ){
mkdir( $jobTargetL2 ) or die "Couldn't create $jobTargetL2 directory, $!";
}
# Only write when the start-time directory does not exist yet; an existing
# directory is left untouched.
if ( not -d $jobTargetL3 ){
mkdir( $jobTargetL3 ) or die "Couldn't create $jobTargetL3 directory, $!";
# meta.json is written with the updated start/stop times ...
my $outstr = $json->encode($metadata);
open my $metaout, '>', "$jobTargetL3/meta.json" or die "Can't write to file $!";
print $metaout $outstr;
close($metaout);
# ... while data.json is copied verbatim.
open my $datafh, '<', "$jobSource/data.json" or die "Can't open file $!";
my $datastr = do { local $/; <$datafh> };
close($datafh);
open my $dataout, '>', "$jobTargetL3/data.json" or die "Can't write to file $!";
print $dataout $datastr;
close($dataout);
}
}
}
}
}
print "Done for job-archive\n";
sleep(1);
# NOTE(review): the script terminates here, so nothing after this line runs.
exit;
## CHECKPOINTS
# Copies cc-metric-store checkpoint files per cluster and node, renaming them
# and shifting their timestamps so that the series starts at $checkpointStart
# with one file every $halfday seconds.
# NOTE(review): an unconditional `exit;` precedes this section, so as written
# this code is never executed — confirm whether that is intentional.
my $checkpTarget = './cc-metric-store/var/checkpoints';
my $checkpSource = './source-data/cc-metric-store-source/checkpoints';
my @CheckpClusters;
# Gen folder
if ( not -d $checkpTarget ){
mkdir( $checkpTarget ) or die "Couldn't create $checkpTarget directory, $!";
}
# Get clusters by cc-metric-store/$subfolder
opendir my $dhc, $checkpSource or die "can't open directory: $!";
while ( readdir $dhc ) {
# The 'job-archive' exclusion mirrors the job-archive section above.
chomp; next if $_ eq '.' or $_ eq '..' or $_ eq 'job-archive';
my $cluster = $_;
push @CheckpClusters, $cluster;
}
closedir($dhc);
# start for checkpoints
foreach my $cluster ( @CheckpClusters ) {
print "Starting to update checkpoint filenames and data starttimes for $cluster\n";
my $clusterTarget = "$checkpTarget/$cluster";
if ( not -d $clusterTarget ){
mkdir( $clusterTarget ) or die "Couldn't create $clusterTarget directory, $!";
}
opendir my $dhLevel1, "$checkpSource/$cluster" or die "can't open directory: $!";
while ( readdir $dhLevel1 ) {
chomp; next if $_ eq '.' or $_ eq '..';
# Nodename as level1-folder
my $level1 = $_;
if ( -d "$checkpSource/$cluster/$level1" ) {
my $nodeSource = "$checkpSource/$cluster/$level1/";
# NOTE(review): $nodeOrigin is assigned but not used anywhere in this section.
my $nodeOrigin = "$nodeSource";
my $nodeTarget = "$clusterTarget/$level1";
my @files;
# A node is only processed if it contains the expected first checkpoint
# file and exactly 14 files in total; otherwise it is skipped.
if ( -e "$nodeSource/1609459200.json") { # 1609459200 == First Checkpoint time in latest dump
opendir(D, "$nodeSource") || die "Can't open directory $nodeSource: $!\n";
while ( readdir D ) {
chomp; next if $_ eq '.' or $_ eq '..';
my $nodeFile = $_;
push @files, $nodeFile;
}
closedir(D);
my $length = @files;
if (!@files || $length != 14) { # needs 14 files == 7 days worth of data
next;
}
} else {
next;
}
# sort for integer timestamp-filename-part (moduleless): Guarantees start with index == 0 == 1609459200.json
# NOTE(review): the '.' before 'json' in the pattern is unescaped and matches
# any character; a file name not matching the pattern yields undef in the
# numeric comparison.
my @sortedFiles = sort { ($a =~ /^([0-9]{10}).json$/)[0] <=> ($b =~ /^([0-9]{10}).json$/)[0] } @files;
# Files are only written when the node target directory does not exist yet.
if ( not -d $nodeTarget ){
mkdir( $nodeTarget ) or die "Couldn't create $nodeTarget directory, $!";
# each() on an array yields (index, value) pairs in order.
while (my ($index, $file) = each(@sortedFiles)) {
open my $checkfh, '<', "$nodeSource/$file" or die "Can't open file $!";
my $rawstr = do { local $/; <$checkfh> };
close($checkfh);
my $checkpdata = $json->decode($rawstr);
# The i-th file (in sorted order) is placed i half-days after $checkpointStart.
my $newTimestamp = $checkpointStart + ($index * $halfday);
# Get Diff from old Timestamp
my $timeDiff = $newTimestamp - $checkpdata->{from};
# Set new timestamp
$checkpdata->{from} = $newTimestamp;
# Shift every metric's start by the same offset as the file's "from" value.
foreach my $metric (keys %{$checkpdata->{metrics}}) {
$checkpdata->{metrics}->{$metric}->{start} += $timeDiff;
}
# The output file is named after the new timestamp.
my $outstr = $json->encode($checkpdata);
open my $checkout, '>', "$nodeTarget/$newTimestamp.json" or die "Can't write to file $!";
print $checkout $outstr;
close($checkout);
}
}
}
}
closedir($dhLevel1);
}
print "Done for checkpoints\n";

View File

@@ -1,36 +0,0 @@
# Docs for ClusterCockpit Searchbar
## Usage
* Searchtags are implemented as `type:<query>` search-string
* Types `jobId, jobName, projectId, username, name, arrayJobId` for roles `admin` and `support`
* `jobName` is jobName as persisted in `job.meta_data` table-column
* `username` is actual account identifier as persisted in `job.user` table-column
* `name` is account owners name as persisted in `user.name` table-column
* Types `jobId, jobName, projectId, arrayJobId` for role `user`
* Examples:
* `jobName:myJob12`
* `jobId:123456`
* `username:abcd100`
* `name:Paul`
* If no searchTag used: Best guess search with the following hierarchy
* `jobId -> username -> name -> projectId -> jobName`
* Destinations:
* JobId: Job-Table (Allows multiple identical matches, e.g. JobIds from different clusters)
* JobName: Job-Table (Allows multiple identical matches, e.g. JobNames from different clusters)
* ProjectId: Job-Table
* Username: Users-Table
* **Please Note**: Only users with jobs will be shown in the table, i.e., users without jobs will be missing from the table. Also, a `Last 30 Days` filter is active by default and might filter out expected users.
* Name: Users-Table
* **Please Note**: Only users with jobs will be shown in the table, i.e., users without jobs will be missing from the table. Also, a `Last 30 Days` filter is active by default and might filter out expected users.
* ArrayJobId: Job-Table (Lists all Jobs of Queried ArrayJobId)
* Best guess search always redirects to Job-Table or `/monitoring/user/$USER` (first username match)
* Unprocessable queries will display messages detailing the cause (Info, Warning, Error)
* Spaces trimmed (both for searchTag and queryString)
* ` job12` == `job12`
* `projectID : abcd ` == `projectId:abcd`
* `jobName`- and `name`-queries work with a part of the target-string
* `jobName:myjob` for jobName "myjob_cluster1"
* `name:Paul` for name "Paul Atreides"
* JobName GQL Query is resolved as matching the query as a part of the whole metaData-JSON in the SQL DB.

170
go.mod
View File

@@ -1,88 +1,126 @@
module github.com/ClusterCockpit/cc-backend
go 1.18
go 1.24.0
toolchain go1.24.1
tool (
github.com/99designs/gqlgen
github.com/swaggo/swag/cmd/swag
)
require (
github.com/99designs/gqlgen v0.17.36
github.com/ClusterCockpit/cc-units v0.4.0
github.com/Masterminds/squirrel v1.5.3
github.com/go-co-op/gocron v1.25.0
github.com/go-ldap/ldap/v3 v3.4.4
github.com/go-sql-driver/mysql v1.7.0
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/golang-migrate/migrate/v4 v4.15.2
github.com/google/gops v0.3.27
github.com/gorilla/handlers v1.5.1
github.com/gorilla/mux v1.8.0
github.com/gorilla/sessions v1.2.1
github.com/influxdata/influxdb-client-go/v2 v2.12.2
github.com/jmoiron/sqlx v1.3.5
github.com/mattn/go-sqlite3 v1.14.16
github.com/prometheus/client_golang v1.14.0
github.com/prometheus/common v0.40.0
github.com/99designs/gqlgen v0.17.84
github.com/ClusterCockpit/cc-lib v1.0.2
github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.0
github.com/aws/aws-sdk-go-v2/config v1.31.20
github.com/aws/aws-sdk-go-v2/credentials v1.18.24
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2
github.com/coreos/go-oidc/v3 v3.16.0
github.com/expr-lang/expr v1.17.6
github.com/go-co-op/gocron/v2 v2.18.2
github.com/go-ldap/ldap/v3 v3.4.12
github.com/golang-jwt/jwt/v5 v5.3.0
github.com/golang-migrate/migrate/v4 v4.19.1
github.com/google/gops v0.3.28
github.com/gorilla/handlers v1.5.2
github.com/gorilla/mux v1.8.1
github.com/gorilla/sessions v1.4.0
github.com/influxdata/line-protocol/v2 v2.2.1
github.com/jmoiron/sqlx v1.4.0
github.com/joho/godotenv v1.5.1
github.com/linkedin/goavro/v2 v2.14.1
github.com/mattn/go-sqlite3 v1.14.32
github.com/nats-io/nats.go v1.47.0
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/common v0.67.4
github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
github.com/swaggo/http-swagger v1.3.3
github.com/swaggo/swag v1.16.1
github.com/vektah/gqlparser/v2 v2.5.8
golang.org/x/crypto v0.12.0
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/stretchr/testify v1.11.1
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.6
github.com/vektah/gqlparser/v2 v2.5.31
golang.org/x/crypto v0.45.0
golang.org/x/oauth2 v0.32.0
golang.org/x/time v0.14.0
)
require (
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.1.1 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect
github.com/aws/smithy-go v1.24.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/containerd/containerd v1.6.18 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/deepmap/oapi-codegen v1.12.4 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
github.com/go-openapi/jsonpointer v0.20.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/spec v0.20.9 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/gorilla/securecookie v1.1.1 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.3 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
github.com/go-openapi/jsonpointer v0.22.3 // indirect
github.com/go-openapi/jsonreference v0.21.3 // indirect
github.com/go-openapi/spec v0.22.1 // indirect
github.com/go-openapi/swag/conv v0.25.4 // indirect
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
github.com/go-openapi/swag/loading v0.25.4 // indirect
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
github.com/goccy/go-yaml v1.19.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.1 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/opencontainers/image-spec v1.0.3-0.20211202183452-c5a74bcca799 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/swaggo/files v1.0.0 // indirect
github.com/urfave/cli/v2 v2.25.7 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
go.uber.org/atomic v1.10.0 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/net v0.14.0 // indirect
golang.org/x/oauth2 v0.5.0 // indirect
golang.org/x/sys v0.11.0 // indirect
golang.org/x/text v0.12.0 // indirect
golang.org/x/tools v0.12.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
github.com/sosodev/duration v1.3.1 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.7 // indirect
github.com/urfave/cli/v3 v3.6.1 // indirect
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/mod v0.30.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/sync v0.18.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/tools v0.39.0 // indirect
google.golang.org/protobuf v1.36.10 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

2148
go.sum

File diff suppressed because it is too large Load Diff

View File

@@ -30,7 +30,9 @@ resolver:
# gqlgen will search for any type names in the schema in these go packages
# if they match it will use them, otherwise it will generate them.
autobind:
- "github.com/99designs/gqlgen/graphql/introspection"
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
- "github.com/ClusterCockpit/cc-backend/internal/config"
# This section declares type mapping between the GraphQL and go type systems
#
@@ -50,34 +52,51 @@ models:
- github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32
Job:
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Job"
model: "github.com/ClusterCockpit/cc-lib/schema.Job"
fields:
tags:
resolver: true
metaData:
resolver: true
Cluster:
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Cluster"
model: "github.com/ClusterCockpit/cc-lib/schema.Cluster"
fields:
partitions:
resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
# Node:
# model: "github.com/ClusterCockpit/cc-lib/schema.Node"
# fields:
# metaData:
# resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" }
JobStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" }
GlobalMetricListItem:
{ model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" }
ClusterSupport:
{ model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" }
Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" }
SchedulerState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" }
HealthState:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" }
MetricStatistics:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" }
MetricConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" }
SubClusterConfig:
{ model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" }
FilterRanges:
{ model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" }

View File

@@ -1,9 +1,9 @@
[Unit]
Description=ClusterCockpit Web Server (Go edition)
Description=ClusterCockpit Web Server
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
After=mariadb.service mysql.service
# Database is file-based SQLite - no service dependency required
[Service]
WorkingDirectory=/opt/monitoring/cc-backend

View File

@@ -1,5 +1,5 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api_test
@@ -14,38 +14,49 @@ import (
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"testing"
"time"
"sync"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
_ "github.com/mattn/go-sqlite3"
)
func setup(t *testing.T) *api.RestApi {
func setup(t *testing.T) *api.RestAPI {
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2m"
},
"auth": {
"jwts": {
"max-age": "2m"
}
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
@@ -54,7 +65,7 @@ func setup(t *testing.T) *api.RestApi {
}
]
}`
const testclusterJson = `{
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
@@ -110,97 +121,108 @@ func setup(t *testing.T) *api.RestApi {
]
}`
log.Init("info", true)
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0777); err != nil {
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
config.Init(cfgFilePath)
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
db := repository.GetConnection()
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
t.Fatal(err)
}
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
t.Fatal(err)
// Initialize memorystore (optional - will return nil if not configured)
// For this test, we don't initialize it to test the nil handling
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
var wg sync.WaitGroup
memorystore.Init(mscfg, &wg)
}
jobRepo := repository.GetJobRepository()
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
archiver.Start(repository.GetJobRepository(), context.Background())
return &api.RestApi{
JobRepository: resolver.Repo,
Resolver: resolver,
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return api.New()
}
// cleanup shuts down the archiver and the memorystore after a test run.
// The archiver is given five seconds to shut down; if Shutdown returns an
// error, it is logged as a warning rather than failing the test.
func cleanup() {
// TODO: Clear all caches, reset all modules, etc...
// Gracefully shutdown archiver with timeout
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
// Shutdown memorystore if it was initialized
memorystore.Shutdown()
}
/*
* This function starts a job, stops it, and then reads its data from the job-archive.
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
* at least `setup` modifies global state.
* This function starts a job, stops it, and tests the REST API.
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
* at least `setup` modifies global state.
*/
func TestRestApi(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
restapi.MountRoutes(r)
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
restapi.MountAPIRoutes(r)
var TestJobId int64 = 123
TestClusterName := "testcluster"
var TestStartTime int64 = 123456789
const startJobBody string = `{
"jobId": 123,
"jobId": 123,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
@@ -210,10 +232,9 @@ func TestRestApi(t *testing.T) {
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"exclusive": 1,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"tags": [{ "type": "testTagType", "name": "testTagName" }],
"resources": [
{
"hostname": "host123",
@@ -224,28 +245,28 @@ func TestRestApi(t *testing.T) {
"startTime": 123456789
}`
var dbid int64
const contextUserKey repository.ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
if ok := t.Run("StartJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
}
var res api.StartJobApiResponse
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
t.Fatal(err)
}
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
if err != nil {
t.Fatal(err)
}
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -257,23 +278,16 @@ func TestRestApi(t *testing.T) {
job.SubCluster != "sc1" ||
job.Partition != "default" ||
job.Walltime != 3600 ||
job.ArrayJobId != 0 ||
job.ArrayJobID != 0 ||
job.NumNodes != 1 ||
job.NumHWThreads != 8 ||
job.NumAcc != 0 ||
job.Exclusive != 1 ||
job.MonitoringStatus != 1 ||
job.SMT != 1 ||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
job.StartTime.Unix() != 123456789 {
job.StartTime != 123456789 {
t.Fatalf("unexpected job properties: %#v", job)
}
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
t.Fatalf("unexpected tags: %#v", job.Tags)
}
dbid = res.DBID
}); !ok {
return
}
@@ -287,19 +301,20 @@ func TestRestApi(t *testing.T) {
"stopTime": 123457789
}`
var stoppedJob *schema.Job
if ok := t.Run("StopJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
restapi.JobRepository.WaitForArchiving()
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
// Archiving happens asynchronously, will be completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -321,30 +336,23 @@ func TestRestApi(t *testing.T) {
t.Fatalf("unexpected job.metaData: %#v", job.MetaData)
}
stoppedJob = job
}); !ok {
return
}
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(data, testData) {
t.Fatal("unexpected data fetched from archive")
}
})
// Note: We skip the CheckArchive test because without memorystore initialized,
// archiving will fail gracefully. This test now focuses on the REST API itself.
t.Run("CheckDoubleStart", func(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart!
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusUnprocessableEntity {
t.Fatal(response.Status, recorder.Body.String())
@@ -359,7 +367,7 @@ func TestRestApi(t *testing.T) {
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"exclusive": 1,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
@@ -371,10 +379,12 @@ func TestRestApi(t *testing.T) {
}`
ok := t.Run("StartJobFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
@@ -384,8 +394,11 @@ func TestRestApi(t *testing.T) {
t.Fatal("subtest failed")
}
time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBodyFailed string = `{
"jobId": 12345,
"jobId": 12345,
"cluster": "testcluster",
"jobState": "failed",
@@ -393,16 +406,18 @@ func TestRestApi(t *testing.T) {
}`
ok = t.Run("StopJobFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
restapi.JobRepository.WaitForArchiving()
// Archiving happens asynchronously, will be completed in cleanup
jobid, cluster := int64(12345), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {

71
internal/api/cluster.go Normal file
View File

@@ -0,0 +1,71 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/schema"
)
// GetClustersAPIResponse is the JSON payload written by getClusters.
// It serializes as an object with a single "clusters" key.
type GetClustersAPIResponse struct {
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
}
// getClusters godoc
// @summary Lists all cluster configs
// @tags Cluster query
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/clusters/ [get]
//
// getClusters writes either the single cluster named by the optional
// `cluster` query parameter or all clusters known to the archive package,
// wrapped in a GetClustersAPIResponse.
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
	// A request without a user in its context is let through; a user that is
	// present must hold the API role.
	if user := repository.GetUserFromContext(r.Context()); user != nil &&
		!user.HasRole(schema.RoleApi) {
		handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
		return
	}

	rw.Header().Add("Content-Type", "application/json")
	bw := bufio.NewWriter(rw)
	defer bw.Flush()

	var clusters []*schema.Cluster

	if r.URL.Query().Has("cluster") {
		name := r.URL.Query().Get("cluster")
		cluster := archive.GetCluster(name)
		if cluster == nil {
			handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
			return
		}
		clusters = append(clusters, cluster)
	} else {
		clusters = archive.Clusters
	}

	payload := GetClustersAPIResponse{
		Clusters: clusters,
	}

	if err := json.NewEncoder(bw).Encode(payload); err != nil {
		handleError(err, http.StatusInternalServerError, rw)
		return
	}
}

File diff suppressed because it is too large Load Diff

1024
internal/api/job.go Normal file

File diff suppressed because it is too large Load Diff

170
internal/api/memorystore.go Normal file
View File

@@ -0,0 +1,170 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
// freeMetrics godoc
// @summary Free buffers from the metric store
// @tags free
// @description This endpoint allows the users to free the Buffers from the
// metric store. This endpoint offers the users to remove them systematically
// and also allows them to prune the data under node, if they do not want to
// remove the whole node.
// @produce json
// @param to query string true "up to timestamp"
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /free/ [post]
//
// freeMetrics reads the required `to` query parameter (a base-10 integer) and
// a JSON request body holding an array of selectors (each an array of
// strings). Every selector is passed to MemoryStore.Free together with `to`;
// the sum of the returned counts is reported in the response body.
func freeMetrics(rw http.ResponseWriter, r *http.Request) {
	rawTo := r.URL.Query().Get("to")
	if rawTo == "" {
		handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
		return
	}

	to, err := strconv.ParseInt(rawTo, 10, 64)
	if err != nil {
		// A malformed timestamp is a client error, not a server failure.
		handleError(fmt.Errorf("parsing query parameter 'to' failed: %w", err), http.StatusBadRequest, rw)
		return
	}

	var selectors [][]string
	if err := json.NewDecoder(r.Body).Decode(&selectors); err != nil {
		// Use the same error helper as every other path of this handler.
		handleError(err, http.StatusBadRequest, rw)
		return
	}

	ms := memorystore.GetMemoryStore()
	n := 0
	for _, sel := range selectors {
		bn, err := ms.Free(sel, to)
		if err != nil {
			handleError(err, http.StatusInternalServerError, rw)
			return
		}

		n += bn
	}

	rw.WriteHeader(http.StatusOK)
	fmt.Fprintf(rw, "buffers freed: %d\n", n)
}
// writeMetrics godoc
// @summary Receive metrics in InfluxDB line-protocol
// @tags write
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
// @accept plain
// @produce json
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /write/ [post]
//
// writeMetrics reads the complete request body, wraps it in a line-protocol
// decoder and hands it to memorystore.DecodeLine together with the memory
// store and the optional `cluster` query parameter.
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
	payload, readErr := io.ReadAll(r.Body)
	rw.Header().Add("Content-Type", "application/json")
	if readErr != nil {
		handleError(readErr, http.StatusInternalServerError, rw)
		return
	}

	store := memorystore.GetMemoryStore()
	decoder := lineprotocol.NewDecoderWithBytes(payload)
	clusterDefault := r.URL.Query().Get("cluster")

	decodeErr := memorystore.DecodeLine(decoder, store, clusterDefault)
	if decodeErr != nil {
		// Log the failure server-side and report it to the client.
		cclog.Errorf("/api/write error: %s", decodeErr.Error())
		handleError(decodeErr, http.StatusBadRequest, rw)
		return
	}

	rw.WriteHeader(http.StatusOK)
}
// debugMetrics godoc
// @summary Debug endpoint
// @tags debug
// @description This endpoint allows the users to print the content of
// nodes/clusters/metrics to review the state of the data.
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /debug/ [post]
//
// debugMetrics splits the optional `selector` query parameter on ':' and asks
// the memory store to dump the selected content into the response. A missing
// selector is passed on as an empty (non-nil) slice.
func debugMetrics(rw http.ResponseWriter, r *http.Request) {
	rw.Header().Add("Content-Type", "application/json")

	selector := []string{}
	if raw := r.URL.Query().Get("selector"); len(raw) != 0 {
		selector = strings.Split(raw, ":")
	}

	// NOTE(review): the buffered writer is never flushed in this handler;
	// presumably DebugDump flushes it before returning - confirm.
	out := bufio.NewWriter(rw)
	if err := memorystore.GetMemoryStore().DebugDump(out, selector); err != nil {
		handleError(err, http.StatusBadRequest, rw)
	}
}
// metricsHealth godoc
// @summary HealthCheck endpoint
// @tags healthcheck
// @description This endpoint allows the users to check if a node is healthy
// @produce json
// @param cluster query string true "Cluster the node belongs to"
// @param node query string true "Node to check"
// @success 200 {string} string "Health check result"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /healthcheck/ [get]
//
// metricsHealth requires both the `cluster` and `node` query parameters and
// passes them, in that order, as the selector to MemoryStore.HealthCheck,
// which writes its result into the response.
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
	rawCluster := r.URL.Query().Get("cluster")
	rawNode := r.URL.Query().Get("node")

	if rawCluster == "" || rawNode == "" {
		handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
		return
	}

	rw.Header().Add("Content-Type", "application/json")

	selector := []string{rawCluster, rawNode}

	ms := memorystore.GetMemoryStore()
	// NOTE(review): the buffered writer is never flushed in this handler;
	// presumably HealthCheck flushes it before returning - confirm.
	if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
		handleError(err, http.StatusBadRequest, rw)
		return
	}
}

231
internal/api/nats.go Normal file
View File

@@ -0,0 +1,231 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bytes"
"database/sql"
"encoding/json"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/nats"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging.
// Handlers report failures through the logger only; they return nothing to
// the sender of a message.
type NatsAPI struct {
// JobRepository is the job store used by the start/stop job handlers.
JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions
// when checking for duplicate jobs during startJob calls.
// handleStartJob holds it from the duplicate lookup until the new job has
// been inserted.
RepositoryMutex sync.Mutex
}
// NewNatsAPI returns a NatsAPI whose JobRepository is the one provided by
// repository.GetJobRepository. The mutex is left at its zero value.
func NewNatsAPI() *NatsAPI {
	api := new(NatsAPI)
	api.JobRepository = repository.GetJobRepository()
	return api
}
// StartSubscriptions registers the NATS subscriptions for the job start, job
// stop and node state subjects configured in config.Keys.APISubjects.
//
// When no NATS client is available a warning is logged and nil is returned;
// when no API subjects are configured nothing is subscribed and nil is
// returned. The first failing Subscribe call aborts the registration and its
// error is returned.
func (api *NatsAPI) StartSubscriptions() error {
	client := nats.GetClient()
	if client == nil {
		cclog.Warn("NATS client not available, skipping API subscriptions")
		return nil
	}

	subjects := config.Keys.APISubjects
	if subjects == nil {
		return nil
	}

	err := client.Subscribe(subjects.SubjectJobStart, api.handleStartJob)
	if err != nil {
		return err
	}

	err = client.Subscribe(subjects.SubjectJobStop, api.handleStopJob)
	if err != nil {
		return err
	}

	err = client.Subscribe(subjects.SubjectNodeState, api.handleNodeState)
	if err != nil {
		return err
	}

	cclog.Info("NATS API subscriptions started")
	return nil
}
// handleStartJob processes job start messages received via NATS.
// Expected JSON payload follows the schema.Job structure; unknown fields are
// rejected.
//
// The request is decoded on top of defaults (Shared "none", monitoring status
// running-or-archiving), forced into the running state and passed through
// importer.SanityChecks. While holding RepositoryMutex the handler looks up
// existing jobs with the same jobId and cluster and refuses the request if any
// of them started less than secondsPerDay before the new start time; otherwise
// the job is inserted. Tags from the request are attached after the mutex has
// been released. All failures are logged and end the handler.
func (api *NatsAPI) handleStartJob(subject string, data []byte) {
	req := schema.Job{
		Shared:           "none",
		MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
	}

	dec := json.NewDecoder(bytes.NewReader(data))
	dec.DisallowUnknownFields()
	if err := dec.Decode(&req); err != nil {
		cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
		return
	}

	cclog.Debugf("NATS %s: %s", subject, req.GoString())
	req.State = schema.JobStateRunning

	if err := importer.SanityChecks(&req); err != nil {
		cclog.Errorf("NATS %s: sanity check failed: %v", subject, err)
		return
	}

	// The Once lets the mutex be released early on the success path while the
	// deferred call still covers every early return.
	var unlockOnce sync.Once
	api.RepositoryMutex.Lock()
	defer unlockOnce.Do(api.RepositoryMutex.Unlock)

	jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
	if err != nil && err != sql.ErrNoRows {
		cclog.Errorf("NATS %s: checking for duplicate failed: %v", subject, err)
		return
	}
	if err == nil {
		for _, job := range jobs {
			if (req.StartTime - job.StartTime) < secondsPerDay {
				// job.ID is a pointer; formatting it with %d would print the
				// address, so log the value it points to.
				var dbid int64
				if job.ID != nil {
					dbid = *job.ID
				}
				cclog.Errorf("NATS %s: job with jobId %d, cluster %s already exists (dbid: %d)",
					subject, req.JobID, req.Cluster, dbid)
				return
			}
		}
	}

	id, err := api.JobRepository.Start(&req)
	if err != nil {
		cclog.Errorf("NATS %s: insert into database failed: %v", subject, err)
		return
	}
	unlockOnce.Do(api.RepositoryMutex.Unlock)

	for _, tag := range req.Tags {
		if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
			cclog.Errorf("NATS %s: adding tag to new job %d failed: %v", subject, id, err)
			return
		}
	}

	cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
		id, req.Cluster, req.JobID, req.User, req.StartTime)
}
// handleStopJob processes job stop messages received via NATS.
// Expected JSON payload follows the StopJobAPIRequest structure; unknown
// fields are rejected and `jobId` is required.
//
// The job is looked up via Find and, if that fails, via FindCached. The
// request is refused when the job is not running, when stopTime lies before
// the job's start time, or when a non-empty state is invalid; an empty state
// defaults to completed. The job is then marked as stopped (Stop, falling back
// to StopCached) while holding JobRepository.Mutex, and archiving is triggered
// unless monitoring is disabled for the job. All failures are logged and end
// the handler.
func (api *NatsAPI) handleStopJob(subject string, data []byte) {
	var req StopJobAPIRequest

	dec := json.NewDecoder(bytes.NewReader(data))
	dec.DisallowUnknownFields()
	if err := dec.Decode(&req); err != nil {
		cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
		return
	}

	if req.JobID == nil {
		cclog.Errorf("NATS %s: the field 'jobId' is required", subject)
		return
	}

	job, err := api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
	if err != nil {
		cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
		if cachedErr != nil {
			cclog.Errorf("NATS %s: finding job failed: %v (cached lookup also failed: %v)",
				subject, err, cachedErr)
			return
		}
		job = cachedJob
	}

	// job.ID is a pointer; formatting it with %d would print the address, so
	// the log messages below use the value it points to.
	var dbid int64
	if job.ID != nil {
		dbid = *job.ID
	}

	if job.State != schema.JobStateRunning {
		cclog.Errorf("NATS %s: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
			subject, job.JobID, dbid, job.Cluster, job.State)
		return
	}

	if job.StartTime > req.StopTime {
		cclog.Errorf("NATS %s: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
			subject, job.JobID, dbid, job.Cluster, req.StopTime, job.StartTime)
		return
	}

	if req.State != "" && !req.State.Valid() {
		cclog.Errorf("NATS %s: jobId %d (id %d) on %s: invalid job state: %#v",
			subject, job.JobID, dbid, job.Cluster, req.State)
		return
	} else if req.State == "" {
		req.State = schema.JobStateCompleted
	}

	job.Duration = int32(req.StopTime - job.StartTime)
	job.State = req.State
	api.JobRepository.Mutex.Lock()
	defer api.JobRepository.Mutex.Unlock()

	// Try the regular stop first; only if that fails fall back to the cached
	// variant, and report the error of the fallback.
	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
		if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
			cclog.Errorf("NATS %s: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
				subject, job.JobID, dbid, job.Cluster, job.State, err)
			return
		}
	}

	cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
		dbid, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)

	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
		return
	}

	archiver.TriggerArchiving(job)
}
// handleNodeState consumes a node state update received via NATS.
// The payload must be a JSON document matching UpdateNodeStatesRequest;
// unknown fields are rejected. For each node in the request a NodeStateDB
// record stamped with the current Unix time is handed to the node repository.
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
	var req UpdateNodeStatesRequest

	decoder := json.NewDecoder(bytes.NewReader(data))
	decoder.DisallowUnknownFields()
	if decodeErr := decoder.Decode(&req); decodeErr != nil {
		cclog.Errorf("NATS %s: parsing request failed: %v", subject, decodeErr)
		return
	}

	nodeRepo := repository.GetNodeRepository()

	for i := range req.Nodes {
		node := &req.Nodes[i]

		// The health state is always recorded as "full" by this handler.
		record := schema.NodeStateDB{}
		record.NodeState = determineState(node.States)
		record.TimeStamp = time.Now().Unix()
		record.CpusAllocated = node.CpusAllocated
		record.MemoryAllocated = node.MemoryAllocated
		record.GpusAllocated = node.GpusAllocated
		record.HealthState = schema.MonitoringStateFull
		record.JobsRunning = node.JobsRunning

		nodeRepo.UpdateNodeState(node.Hostname, req.Cluster, &record)
	}

	cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)
}

80
internal/api/node.go Normal file
View File

@@ -0,0 +1,80 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"fmt"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-lib/schema"
)
// UpdateNodeStatesRequest is the JSON payload of a node state update: the
// states of a set of nodes that all belong to one cluster.
type UpdateNodeStatesRequest struct {
// Nodes holds one entry per node whose state is reported.
Nodes []schema.NodePayload `json:"nodes"`
// Cluster is the name of the cluster the nodes belong to.
Cluster string `json:"cluster" example:"fritz"`
}
// determineState maps a list of scheduler state names to a single
// schema.SchedulerState. Names are compared case-insensitively and the first
// recognized one ("allocated", "reserved", "idle", "down" or "mixed") wins, so
// the routine assumes that only one of them exists per node. If no name is
// recognized, schema.NodeStateUnknown is returned.
func determineState(states []string) schema.SchedulerState {
	for i := 0; i < len(states); i++ {
		name := strings.ToLower(states[i])

		if name == "allocated" {
			return schema.NodeStateAllocated
		}
		if name == "reserved" {
			return schema.NodeStateReserved
		}
		if name == "idle" {
			return schema.NodeStateIdle
		}
		if name == "down" {
			return schema.NodeStateDown
		}
		if name == "mixed" {
			return schema.NodeStateMixed
		}
	}

	return schema.NodeStateUnknown
}
// updateNodeStates godoc
// @summary Deliver updated Slurm node states
// @tags Nodestates
// @description Stores the reported state of every node contained in the request body for the given cluster.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/nodestats/ [post]
//
// updateNodeStates decodes an UpdateNodeStatesRequest from the request body
// and hands one NodeStateDB record per node, stamped with the current Unix
// time, to the node repository.
//
// NOTE(review): on success the handler writes no response body although the
// API documentation above announces a DefaultAPIResponse - confirm which one
// is intended.
func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
	// Parse request body
	req := UpdateNodeStatesRequest{}
	if err := decode(r.Body, &req); err != nil {
		handleError(fmt.Errorf("parsing request body failed: %w", err),
			http.StatusBadRequest, rw)
		return
	}
	repo := repository.GetNodeRepository()

	for _, node := range req.Nodes {
		state := determineState(node.States)
		// The health state is always recorded as "full" by this handler.
		nodeState := schema.NodeStateDB{
			TimeStamp: time.Now().Unix(), NodeState: state,
			CpusAllocated:   node.CpusAllocated,
			MemoryAllocated: node.MemoryAllocated,
			GpusAllocated:   node.GpusAllocated,
			HealthState:     schema.MonitoringStateFull,
			JobsRunning:     node.JobsRunning,
		}

		repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
	}
}

File diff suppressed because it is too large Load Diff

221
internal/api/user.go Normal file
View File

@@ -0,0 +1,221 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
)
// APIReturnedUser describes a user as exposed through the REST API.
// NOTE(review): presumably the element type of the getUsers response, whose
// API documentation references it - confirm against what ListUsers returns.
type APIReturnedUser struct {
Username string `json:"username"`
Name string `json:"name"`
Roles []string `json:"roles"`
Email string `json:"email"`
Projects []string `json:"projects"`
}
// getUsers godoc
// @summary Returns a list of users
// @tags User
// @description Returns a JSON-encoded list of users.
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
// @failure 500 {string} string "Internal Server Error"
// @security ApiKeyAuth
// @router /api/users/ [get]
//
// getUsers is restricted to admins. The `not-just-user` query parameter is
// compared against the literal "true" and passed to ListUsers.
func (api *RestAPI) getUsers(rw http.ResponseWriter, r *http.Request) {
	// SecuredCheck() only worked with TokenAuth: Removed

	// A request without a user in its context is rejected as well, instead of
	// calling HasRole on a nil user.
	if user := repository.GetUserFromContext(r.Context()); user == nil || !user.HasRole(schema.RoleAdmin) {
		handleError(fmt.Errorf("only admins are allowed to fetch a list of users"), http.StatusForbidden, rw)
		return
	}

	users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
	if err != nil {
		handleError(fmt.Errorf("listing users failed: %w", err), http.StatusInternalServerError, rw)
		return
	}

	rw.Header().Set("Content-Type", "application/json")
	// The status line has already been sent once encoding starts, so an
	// encoding failure can only be logged.
	if err := json.NewEncoder(rw).Encode(users); err != nil {
		cclog.Errorf("Failed to encode users response: %v", err)
	}
}
// updateUser godoc
// @summary Update user roles and projects
// @tags User
// @description Allows admins to add/remove roles and projects for a user
// @produce json
// @param id path string true "Username"
// @param add-role formData string false "Role to add"
// @param remove-role formData string false "Role to remove"
// @param add-project formData string false "Project to add"
// @param remove-project formData string false "Project to remove"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/user/{id} [post]
//
// updateUser is restricted to admins and performs exactly one operation per
// request. The form values are checked in the order add-role, remove-role,
// add-project, remove-project and the first non-empty one is applied to the
// user named by the `id` path variable. A request without any of them is
// answered with 400.
func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
	// SecuredCheck() only worked with TokenAuth: Removed

	// A request without a user in its context is rejected as well, instead of
	// calling HasRole on a nil user.
	if user := repository.GetUserFromContext(r.Context()); user == nil || !user.HasRole(schema.RoleAdmin) {
		handleError(fmt.Errorf("only admins are allowed to update a user"), http.StatusForbidden, rw)
		return
	}

	// Get Values
	newrole := r.FormValue("add-role")
	delrole := r.FormValue("remove-role")
	newproj := r.FormValue("add-project")
	delproj := r.FormValue("remove-project")
	username := mux.Vars(r)["id"]

	rw.Header().Set("Content-Type", "application/json")

	// reply writes the JSON success message; an encoding failure can only be
	// logged because the response has already been started.
	reply := func(message string) {
		if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: message}); err != nil {
			cclog.Errorf("Failed to encode response: %v", err)
		}
	}

	switch {
	case newrole != "":
		if err := repository.GetUserRepository().AddRole(r.Context(), username, newrole); err != nil {
			handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
			return
		}
		reply("Add Role Success")
	case delrole != "":
		if err := repository.GetUserRepository().RemoveRole(r.Context(), username, delrole); err != nil {
			handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
			return
		}
		reply("Remove Role Success")
	case newproj != "":
		if err := repository.GetUserRepository().AddProject(r.Context(), username, newproj); err != nil {
			handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
			return
		}
		reply("Add Project Success")
	case delproj != "":
		if err := repository.GetUserRepository().RemoveProject(r.Context(), username, delproj); err != nil {
			handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
			return
		}
		reply("Remove Project Success")
	default:
		handleError(fmt.Errorf("no operation specified: must provide add-role, remove-role, add-project, or remove-project"), http.StatusBadRequest, rw)
	}
}
// createUser godoc
// @summary Create a new user
// @tags User
// @description Creates a new user with specified credentials and role
// @produce plain
// @param username formData string true "Username"
// @param password formData string false "Password (not required for API users)"
// @param role formData string true "User role"
// @param name formData string false "Full name"
// @param email formData string false "Email address"
// @param project formData string false "Project (required for managers)"
// @success 200 {string} string "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/users/ [post]
//
// createUser is restricted to admins. It validates the form values (username
// of 1 to 100 bytes, a password unless the role is the API role, a project if
// and only if the role is the manager role) and then adds the user through
// the user repository.
func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
	// SecuredCheck() only worked with TokenAuth: Removed

	rw.Header().Set("Content-Type", "text/plain")
	me := repository.GetUserFromContext(r.Context())
	// A request without a user in its context is rejected as well, instead of
	// calling HasRole on a nil user.
	if me == nil || !me.HasRole(schema.RoleAdmin) {
		handleError(fmt.Errorf("only admins are allowed to create new users"), http.StatusForbidden, rw)
		return
	}

	username, password, role, name, email, project := r.FormValue("username"),
		r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
		r.FormValue("email"), r.FormValue("project")

	// Validate username length
	if len(username) == 0 || len(username) > 100 {
		handleError(fmt.Errorf("username must be between 1 and 100 characters"), http.StatusBadRequest, rw)
		return
	}

	if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
		handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
		return
	}

	if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
		handleError(fmt.Errorf("only managers require a project (can be changed later)"), http.StatusBadRequest, rw)
		return
	} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
		handleError(fmt.Errorf("managers require a project to manage (can be changed later)"), http.StatusBadRequest, rw)
		return
	}

	// NOTE(review): for non-manager roles `project` is empty here, so Projects
	// becomes a one-element slice holding "" - confirm that AddUser expects
	// this rather than an empty slice.
	if err := repository.GetUserRepository().AddUser(&schema.User{
		Username: username,
		Name:     name,
		Password: password,
		Email:    email,
		Projects: []string{project},
		Roles:    []string{role},
	}); err != nil {
		handleError(fmt.Errorf("adding user failed: %w", err), http.StatusUnprocessableEntity, rw)
		return
	}

	fmt.Fprintf(rw, "User %v successfully created!\n", username)
}
// deleteUser godoc
// @summary Delete a user
// @tags User
// @description Deletes a user from the system
// @produce plain
// @param username formData string true "Username to delete"
// @success 200 {string} string "Success"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/users/ [delete]
//
// deleteUser is restricted to admins and removes the user named by the
// `username` form value through the user repository.
func (api *RestAPI) deleteUser(rw http.ResponseWriter, r *http.Request) {
	// SecuredCheck() only worked with TokenAuth: Removed

	// A request without a user in its context is rejected as well, instead of
	// calling HasRole on a nil user.
	if user := repository.GetUserFromContext(r.Context()); user == nil || !user.HasRole(schema.RoleAdmin) {
		handleError(fmt.Errorf("only admins are allowed to delete a user"), http.StatusForbidden, rw)
		return
	}

	username := r.FormValue("username")
	if err := repository.GetUserRepository().DelUser(username); err != nil {
		handleError(fmt.Errorf("deleting user failed: %w", err), http.StatusUnprocessableEntity, rw)
		return
	}

	rw.WriteHeader(http.StatusOK)
}

190
internal/archiver/README.md Normal file
View File

@@ -0,0 +1,190 @@
# Archiver Package
The `archiver` package provides asynchronous job archiving functionality for ClusterCockpit. When jobs complete, their metric data is archived from the metric store to a persistent archive backend (filesystem, S3, SQLite, etc.).
## Architecture
### Producer-Consumer Pattern
```
┌──────────────┐    TriggerArchiving()    ┌────────────────┐
│ API Handler  │ ───────────────────────▶ │ archiveChannel │
│  (Job Stop)  │                          │ (buffer: 128)  │
└──────────────┘                          └───────┬────────┘
                                                  │
                ┌─────────────────────────────────┘
                │
                ▼
     ┌──────────────────────┐
     │  archivingWorker()   │
     │     (goroutine)      │
     └──────────┬───────────┘
                │
                ▼
     1. Fetch job metadata
     2. Load metric data
     3. Calculate statistics
     4. Archive to backend
     5. Update database
     6. Call hooks
```
### Components
- **archiveChannel**: Buffered channel (128 jobs) for async communication
- **archivePending**: WaitGroup tracking in-flight archiving operations
- **archivingWorker**: Background goroutine processing archiving requests
- **shutdownCtx**: Context for graceful cancellation during shutdown
## Usage
### Initialization
```go
// Start archiver with context for shutdown control
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
archiver.Start(jobRepository, ctx)
```
### Archiving a Job
```go
// Called automatically when a job completes
archiver.TriggerArchiving(job)
```
The function returns immediately. Actual archiving happens in the background.
### Graceful Shutdown
```go
// Shutdown with 10 second timeout
if err := archiver.Shutdown(10 * time.Second); err != nil {
log.Printf("Archiver shutdown timeout: %v", err)
}
```
**Shutdown process:**
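1. Closes the channel so that no further jobs can be queued (calling `TriggerArchiving()` after this point panics, because it sends on a closed channel)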
2. Waits for pending jobs (up to timeout)
3. Cancels context if timeout exceeded
4. Waits for worker to exit cleanly
## Configuration
### Channel Buffer Size
The archiving channel has a buffer of 128 jobs. If more than 128 jobs are queued simultaneously, `TriggerArchiving()` will block until space is available.
To adjust:
```go
// In archiveWorker.go Start() function
archiveChannel = make(chan *schema.Job, 256) // Increase buffer
```
### Scope Selection
Archive data scopes are automatically selected based on job size:
- **Node scope**: Always included
- **Core scope**: Included for jobs with ≤8 nodes (reduces data volume for large jobs)
- **Accelerator scope**: Included if job used accelerators (`NumAcc > 0`)
To adjust the node threshold:
```go
// In archiver.go ArchiveJob() function
if job.NumNodes <= 16 { // Change from 8 to 16
scopes = append(scopes, schema.MetricScopeCore)
}
```
### Resolution
Data is archived at the highest available resolution (typically 60s intervals). To change:
```go
// In archiver.go ArchiveJob() function
jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
## Error Handling
### Automatic Retry
The archiver does **not** automatically retry failed archiving operations. If archiving fails:
1. Error is logged
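2. Job is marked as `MonitoringStatusArchivingFailed` in database if the metadata check or the archive step failed (failures while updating footprint, energy, or the final database entry are only logged)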
3. Worker continues processing other jobs
### Manual Retry
To re-archive failed jobs, query for jobs with `MonitoringStatusArchivingFailed` and call `TriggerArchiving()` again.
## Performance Considerations
### Single Worker Thread
The archiver uses a single worker goroutine. For high-throughput systems:
- Large channel buffer (128) absorbs bursts; `TriggerArchiving()` only blocks once the buffer is full
- Archiving is typically I/O bound (writing to storage)
- Single worker prevents overwhelming storage backend
### Shutdown Timeout
Recommended timeout values:
- **Development**: 5-10 seconds
- **Production**: 10-30 seconds
- **High-load**: 30-60 seconds
Choose based on:
- Average archiving time per job
- Storage backend latency
- Acceptable shutdown delay
## Monitoring
### Logging
The archiver logs:
- **Info**: Startup, shutdown, successful completions
- **Debug**: Individual job archiving times
- **Error**: Archiving failures with job ID and reason
- **Warn**: Shutdown timeout exceeded
### Metrics
Monitor these signals for archiver health:
- Jobs with `MonitoringStatusArchivingFailed`
- Time from job stop to successful archive
- Shutdown timeout occurrences
## Thread Safety
Concurrency guarantees of the exported functions:
- `Start()` - Safe to call once
- `TriggerArchiving()` - Safe from multiple goroutines
- `Shutdown()` - Safe to call once
- `WaitForArchiving()` - Deprecated, but safe
Internal state is protected by:
- Channel synchronization (`archiveChannel`)
- WaitGroup for pending count (`archivePending`)
- Context for cancellation (`shutdownCtx`)
## Files
- **archiveWorker.go**: Worker lifecycle, channel management, shutdown logic
- **archiver.go**: Core archiving logic, metric loading, statistics calculation
## Dependencies
- `internal/repository`: Database operations for job metadata
- `internal/metricdispatcher`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures

View File

@@ -0,0 +1,250 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package archiver provides asynchronous job archiving functionality for ClusterCockpit.
//
// The archiver runs a background worker goroutine that processes job archiving requests
// from a buffered channel. When jobs complete, their metric data is archived from the
// metric store to the configured archive backend (filesystem, S3, etc.).
//
// # Architecture
//
// The archiver uses a producer-consumer pattern:
// - Producer: TriggerArchiving() sends jobs to archiveChannel
// - Consumer: archivingWorker() processes jobs from the channel
// - Coordination: sync.WaitGroup tracks pending archive operations
//
// # Lifecycle
//
// 1. Start(repo, ctx) - Initialize worker with context for cancellation
// 2. TriggerArchiving(job) - Queue job for archiving (called when job stops)
// 3. archivingWorker() - Background goroutine processes jobs
// 4. Shutdown(timeout) - Graceful shutdown with timeout
//
// # Graceful Shutdown
//
// The archiver supports graceful shutdown with configurable timeout:
// - Closes channel to reject new jobs
// - Waits for pending jobs to complete (up to timeout)
// - Cancels context if timeout exceeded
// - Ensures worker goroutine exits cleanly
//
// # Example Usage
//
// // Initialize archiver
// ctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// archiver.Start(jobRepository, ctx)
//
// // Trigger archiving when job completes
// archiver.TriggerArchiving(job)
//
// // Graceful shutdown with 10 second timeout
// if err := archiver.Shutdown(10 * time.Second); err != nil {
// log.Printf("Archiver shutdown timeout: %v", err)
// }
package archiver
import (
"context"
"fmt"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
sq "github.com/Masterminds/squirrel"
)
// Package-level archiver state. All of it is initialized by Start().
var (
// archivePending is incremented by TriggerArchiving() for every queued job
// and decremented by the worker once that job has been handled.
archivePending sync.WaitGroup
// archiveChannel is the buffered queue between TriggerArchiving() and the
// worker; Shutdown() closes it.
archiveChannel chan *schema.Job
// jobRepo is the repository the worker uses for metadata lookups and
// database updates.
jobRepo *repository.JobRepository
// shutdownCtx is derived from the context passed to Start(); the worker
// exits when it is done and passes it on to ArchiveJob().
shutdownCtx context.Context
// shutdownCancel cancels shutdownCtx; Shutdown() calls it when its timeout
// is exceeded.
shutdownCancel context.CancelFunc
// workerDone is closed by archivingWorker() when it returns.
workerDone chan struct{}
)
// Start sets up the archiver's package-level state and launches the single
// background worker goroutine.
//
// Jobs handed to TriggerArchiving() are queued on a buffered channel
// (capacity 128) and consumed by that worker.
//
// Parameters:
//   - r: JobRepository used by the worker for all database operations
//   - ctx: parent context; the archiver derives its own cancellable context
//     from it, so cancelling ctx also stops the worker
//
// The worker keeps running until either the derived context is cancelled or
// the queue channel is closed by Shutdown().
//
// Start must be called before TriggerArchiving() and must be called only once:
// it overwrites the package-level state without any synchronization.
func Start(r *repository.JobRepository, ctx context.Context) {
	jobRepo = r
	workerDone = make(chan struct{})
	archiveChannel = make(chan *schema.Job, 128)
	shutdownCtx, shutdownCancel = context.WithCancel(ctx)

	go archivingWorker()
}
// archivingWorker is the background goroutine that processes job archiving requests.
//
// The worker loop:
// 1. Blocks waiting for jobs on archiveChannel or shutdown signal
// 2. Fetches job metadata from repository
// 3. Archives job data to configured backend (calls ArchiveJob)
// 4. Updates job footprint and energy metrics in database
// 5. Marks job as successfully archived
// 6. Calls job stop hooks
//
// The worker exits when:
// - shutdownCtx is cancelled (timeout during shutdown)
// - archiveChannel is closed (normal shutdown)
//
// Errors during archiving are logged and the job is marked as failed,
// but the worker continues processing other jobs.
func archivingWorker() {
defer close(workerDone)
for {
select {
case <-shutdownCtx.Done():
cclog.Info("Archive worker received shutdown signal")
return
case job, ok := <-archiveChannel:
if !ok {
cclog.Info("Archive channel closed, worker exiting")
return
}
start := time.Now()
// not using meta data, called to load JobMeta into Cache?
// will fail if job meta not in repository
if _, err := jobRepo.FetchMetadata(job); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
}
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
// Use shutdown context to allow cancellation
jobMeta, err := ArchiveJob(job, shutdownCtx)
if err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
}
stmt := sq.Update("job").Where("job.id = ?", job.ID)
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
archivePending.Done()
continue
}
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
archivePending.Done()
continue
}
// Update the jobs database entry one last time:
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
if err := jobRepo.Execute(stmt); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
archivePending.Done()
continue
}
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
repository.CallJobStopHooks(job)
archivePending.Done()
}
}
}
// TriggerArchiving hands a job to the background worker for asynchronous archiving.
//
// It is meant to be called when a job stops. The call
//  1. increments the pending job counter (WaitGroup), and
//  2. sends the job on the archiving channel (buffered, capacity 128).
//
// The send returns immediately as long as the buffer has room and blocks
// otherwise. The worker decrements the pending counter once it has handled
// the job.
//
// If Start() has not been called (the channel is nil), cclog.Fatal is invoked.
// The function must not be called after Shutdown(): Shutdown() closes the
// channel, and sending on a closed channel panics.
func TriggerArchiving(job *schema.Job) {
	queue := archiveChannel
	if queue == nil {
		cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
	}

	archivePending.Add(1)
	queue <- job
}
// Shutdown performs a graceful shutdown of the archiver with a configurable timeout.
//
// The shutdown process:
//  1. Closes archiveChannel - no new jobs may be queued afterwards
//  2. Waits for pending jobs to complete (up to timeout duration)
//  3. If the timeout is exceeded:
//     - Cancels shutdownCtx to interrupt the ongoing ArchiveJob operation
//     - Waits for the worker goroutine to exit
//     - Discards jobs that are still queued and releases their pending count,
//     so the goroutine waiting on the WaitGroup can finish
//     - Returns an error indicating the timeout
//
// Parameters:
//   - timeout: Maximum duration to wait for pending jobs to complete
//     (recommended: 10-30 seconds for production)
//
// Returns:
//   - nil if all jobs completed within timeout, or if the archiver was never
//     started
//   - error if timeout was exceeded (some jobs may not have been archived)
//
// A job that is interrupted by the cancellation is marked as failed by the
// worker. Jobs that were still waiting in the queue are only counted and
// logged here; their monitoring status is left untouched.
//
// The function always ensures the worker goroutine has exited before
// returning. It must be called at most once: closing the channel a second time
// panics.
//
// Example:
//
//	if err := archiver.Shutdown(10 * time.Second); err != nil {
//		log.Printf("Some jobs did not complete: %v", err)
//	}
func Shutdown(timeout time.Duration) error {
	cclog.Info("Initiating archiver shutdown...")

	// Without Start() there is no channel and no worker; closing a nil channel
	// would panic.
	if archiveChannel == nil {
		cclog.Warn("Archiver shutdown requested but archiver was never started")
		return nil
	}

	// Close channel to signal no more jobs will be accepted
	close(archiveChannel)

	// Create a channel to signal when all jobs are done
	done := make(chan struct{})
	go func() {
		archivePending.Wait()
		close(done)
	}()

	timer := time.NewTimer(timeout)
	defer timer.Stop()

	// Wait for jobs to complete or timeout
	select {
	case <-done:
		cclog.Info("All archive jobs completed successfully")
		// Wait for worker to exit
		<-workerDone
		return nil
	case <-timer.C:
		cclog.Warn("Archiver shutdown timeout exceeded, cancelling remaining operations")
		// Cancel any ongoing operations
		shutdownCancel()
		// Wait for worker to exit
		<-workerDone

		// The worker may have exited with jobs still buffered. The channel is
		// closed and has no other reader, so it can be drained safely. Each
		// drained job gives back its pending count; otherwise the Wait()
		// goroutine above would block forever.
		dropped := 0
		for range archiveChannel {
			archivePending.Done()
			dropped++
		}
		if dropped > 0 {
			cclog.Warnf("Archiver shutdown discarded %d queued job(s) that were never processed", dropped)
		}

		return fmt.Errorf("archiver shutdown timeout after %v", timeout)
	}
}

View File

@@ -0,0 +1,105 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archiver
import (
"context"
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
// ArchiveJob archives a completed job's metric data to the configured archive backend.
//
// This function performs the following operations:
//  1. Loads the data of all metrics configured for the job's cluster
//  2. Calculates job-level statistics (avg, min, max) for each metric from the
//     node-scope series
//  3. Stores the job metadata and metric data to the archive backend
//
// Metric data is requested with resolution 0 (highest available resolution)
// for the following scopes:
//   - Node scope (always)
//   - Core scope (for jobs with ≤8 nodes, to reduce data volume)
//   - Accelerator scope (if job used accelerators)
//
// ctx is passed on to the metric data loader so that loading can be cancelled
// (e.g. during a shutdown timeout).
//
// Parameters:
//   - job: The job to archive; its Statistics field is overwritten
//   - ctx: Context for cancellation and timeout control
//
// Returns:
//   - the same *schema.Job with populated Statistics field
//   - error if data loading or archiving fails
//
// If config.Keys.DisableArchive is true, only job statistics are calculated
// and returned (no data is written to archive backend).
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
	metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
	allMetrics := make([]string, 0, len(metricConfigs))
	for _, mc := range metricConfigs {
		allMetrics = append(allMetrics, mc.Name)
	}

	scopes := []schema.MetricScope{schema.MetricScopeNode}
	// FIXME: Add a config option for this
	if job.NumNodes <= 8 {
		// This will add the native scope if core scope is not available
		scopes = append(scopes, schema.MetricScopeCore)
	}

	if job.NumAcc > 0 {
		scopes = append(scopes, schema.MetricScopeAccelerator)
	}

	// A resolution value of 0 retrieves the highest resolution.
	jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 0)
	if err != nil {
		cclog.Error("Error wile loading job data for archiving")
		return nil, err
	}

	job.Statistics = make(map[string]schema.JobStatistics)

	for metric, data := range jobData {
		nodeData, ok := data["node"]
		if !ok {
			// This should never happen ?
			continue
		}

		// Named sum/lowest/highest so the builtin min and max are not shadowed.
		sum, lowest, highest := 0.0, math.MaxFloat32, -math.MaxFloat32
		for _, series := range nodeData.Series {
			sum += series.Statistics.Avg
			lowest = math.Min(lowest, series.Statistics.Min)
			highest = math.Max(highest, series.Statistics.Max)
		}

		// Look the metric configuration up once per metric.
		unit := archive.GetMetricConfig(job.Cluster, metric).Unit

		// The average is taken over the job's node count and rounded to two
		// decimal places.
		// NOTE(review): with an empty series list Min/Max keep their sentinel
		// start values, and NumNodes == 0 divides by zero — confirm neither
		// can occur for archived jobs.
		job.Statistics[metric] = schema.JobStatistics{
			Unit: schema.Unit{
				Prefix: unit.Prefix,
				Base:   unit.Base,
			},
			Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
			Min: lowest,
			Max: highest,
		}
	}

	// If the file based archive is disabled,
	// only return the JobMeta structure as the
	// statistics in there are needed.
	if config.Keys.DisableArchive {
		return job, nil
	}

	return job, archive.GetHandle().ImportJob(job, &jobData)
}

View File

@@ -1,47 +1,137 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package auth implements various authentication methods
package auth
import (
"bytes"
"context"
"crypto/rand"
"database/sql"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"os"
"sync"
"time"
"golang.org/x/time/rate"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/gorilla/sessions"
)
// Authenticator is the interface for all authentication methods.
// Each authenticator determines if it can handle a login request (CanLogin)
// and performs the actual authentication (Login).
type Authenticator interface {
// CanLogin determines if this authenticator can handle the login request.
// It returns the user object if available and a boolean indicating if this
// authenticator should attempt the login. This method should not perform
// expensive operations or actual authentication.
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
// Login performs the actual authentication for the user.
// It returns the authenticated user or an error if authentication fails.
// The user parameter may be nil if the user doesn't exist in the database yet.
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
}
type Authentication struct {
sessionStore *sessions.CookieStore
SessionMaxAge time.Duration
var (
// initOnce guards the body of Init so that it runs only once.
initOnce sync.Once
// authInstance is the package-wide Authentication object created by Init
// and returned by GetAuthInstance.
authInstance *Authentication
)
authenticators []Authenticator
// rateLimiterEntry tracks a rate limiter and its last use time for cleanup.
//
// Entries are shared between concurrently running login handlers and the
// cleanup goroutine, so lastUsed is only accessed while holding mu.
type rateLimiterEntry struct {
	limiter *rate.Limiter

	mu       sync.Mutex // guards lastUsed
	lastUsed time.Time
}

// touch records now as the time of the most recent use.
func (e *rateLimiterEntry) touch(now time.Time) {
	e.mu.Lock()
	e.lastUsed = now
	e.mu.Unlock()
}

// unusedSince reports whether the entry was last used before cutoff.
func (e *rateLimiterEntry) unusedSince(cutoff time.Time) bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.lastUsed.Before(cutoff)
}

// ipUserLimiters maps "ip:username" keys to *rateLimiterEntry values.
var ipUserLimiters sync.Map

// getIPUserLimiter returns a rate limiter for the given IP and username combination.
//
// Rate limiters are created on demand with a burst of 5 and a refill of one
// attempt every 3 minutes (5 attempts per 15 minutes). Every call refreshes the
// entry's last-use time, which cleanupOldRateLimiters uses to evict idle
// entries.
func getIPUserLimiter(ip, username string) *rate.Limiter {
	key := ip + ":" + username
	now := time.Now()

	// Fast path: the limiter already exists.
	if existing, ok := ipUserLimiters.Load(key); ok {
		entry := existing.(*rateLimiterEntry)
		entry.touch(now)
		return entry.limiter
	}

	// More aggressive rate limiting: 5 attempts per 15 minutes
	fresh := &rateLimiterEntry{
		limiter:  rate.NewLimiter(rate.Every(15*time.Minute/5), 5),
		lastUsed: now,
	}

	// LoadOrStore makes concurrent first requests for the same key agree on a
	// single limiter instead of each storing (and using) its own.
	actual, loaded := ipUserLimiters.LoadOrStore(key, fresh)
	entry := actual.(*rateLimiterEntry)
	if loaded {
		entry.touch(now)
	}
	return entry.limiter
}

// cleanupOldRateLimiters removes rate limiters that haven't been used since olderThan.
func cleanupOldRateLimiters(olderThan time.Time) {
	ipUserLimiters.Range(func(key, value any) bool {
		entry := value.(*rateLimiterEntry)
		if entry.unusedSince(olderThan) {
			ipUserLimiters.Delete(key)
			cclog.Debugf("Cleaned up rate limiter for %v", key)
		}
		return true
	})
}
// startRateLimiterCleanup launches a background goroutine that evicts idle
// rate limiters. The goroutine wakes up once per hour and removes every
// limiter that has not been used during the preceding 24 hours. It has no stop
// mechanism.
func startRateLimiterCleanup() {
	const (
		sweepInterval = 1 * time.Hour
		maxIdle       = 24 * time.Hour
	)

	sweepLoop := func() {
		ticker := time.NewTicker(sweepInterval)
		defer ticker.Stop()

		for range ticker.C {
			// Clean up limiters not used in the last 24 hours
			cutoff := time.Now().Add(-maxIdle)
			cleanupOldRateLimiters(cutoff)
		}
	}

	go sweepLoop()
}
// AuthConfig contains configuration for all authentication methods.
// It is the decode target for the JSON authentication configuration. Every
// section is a pointer, so a section that is absent from the JSON stays nil.
type AuthConfig struct {
// LdapConfig is read from the "ldap" key.
LdapConfig *LdapConfig `json:"ldap"`
// JwtConfig is read from the "jwts" key.
JwtConfig *JWTAuthConfig `json:"jwts"`
// OpenIDConfig is read from the "oidc" key.
OpenIDConfig *OpenIDConfig `json:"oidc"`
}
// Keys holds the global authentication configuration
var Keys AuthConfig
// Authentication manages all authentication methods and session handling
type Authentication struct {
// sessionStore is the cookie store used to load and save the "session" cookie.
sessionStore *sessions.CookieStore
// LdapAuth is the LDAP authenticator.
LdapAuth *LdapAuthenticator
// JwtAuth is the JWT authenticator; Auth and AuthAPI call its AuthViaJWT method.
JwtAuth *JWTAuthenticator
// LocalAuth is the authenticator for local accounts.
LocalAuth *LocalAuthenticator
// authenticators lists the login methods that Login tries via CanLogin.
authenticators []Authenticator
// SessionMaxAge is applied as the session cookie's MaxAge by SaveSession
// when it is non-zero.
SessionMaxAge time.Duration
}
func (auth *Authentication) AuthViaSession(
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
session, err := auth.sessionStore.Get(r, "session")
if err != nil {
log.Error("Error while getting session store")
cclog.Error("Error while getting session store")
return nil, err
}
@@ -49,10 +139,31 @@ func (auth *Authentication) AuthViaSession(
return nil, nil
}
// TODO: Check if session keys exist
username, _ := session.Values["username"].(string)
projects, _ := session.Values["projects"].([]string)
roles, _ := session.Values["roles"].([]string)
// Validate session data with proper type checking
username, ok := session.Values["username"].(string)
if !ok || username == "" {
cclog.Warn("Invalid session: missing or invalid username")
// Invalidate the corrupted session
session.Options.MaxAge = -1
_ = auth.sessionStore.Save(r, rw, session)
return nil, errors.New("invalid session data")
}
projects, ok := session.Values["projects"].([]string)
if !ok {
cclog.Warn("Invalid session: projects not found or invalid type, using empty list")
projects = []string{}
}
roles, ok := session.Values["roles"].([]string)
if !ok || len(roles) == 0 {
cclog.Warn("Invalid session: missing or invalid roles")
// Invalidate the corrupted session
session.Options.MaxAge = -1
_ = auth.sessionStore.Save(r, rw, session)
return nil, errors.New("invalid session data")
}
return &schema.User{
Username: username,
Projects: projects,
@@ -62,86 +173,179 @@ func (auth *Authentication) AuthViaSession(
}, nil
}
func Init() (*Authentication, error) {
auth := &Authentication{}
func Init(authCfg *json.RawMessage) {
initOnce.Do(func() {
authInstance = &Authentication{}
// Start background cleanup of rate limiters
startRateLimiterCleanup()
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
return nil, err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
} else {
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
log.Error("Error while initializing authentication -> decoding session key failed")
return nil, err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
}
if config.Keys.LdapConfig != nil {
ldapAuth := &LdapAuthenticator{}
if err := ldapAuth.Init(); err != nil {
log.Warn("Error while initializing authentication -> ldapAuth init failed")
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
cclog.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
cclog.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
} else {
auth.LdapAuth = ldapAuth
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
}
} else {
log.Info("Missing LDAP configuration: No LDAP support!")
}
if config.Keys.JwtConfig != nil {
auth.JwtAuth = &JWTAuthenticator{}
if err := auth.JwtAuth.Init(); err != nil {
log.Error("Error while initializing authentication -> jwtAuth init failed")
return nil, err
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
cclog.Fatal("Error while initializing authentication -> decoding session key failed")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
}
jwtSessionAuth := &JWTSessionAuthenticator{}
if err := jwtSessionAuth.Init(); err != nil {
log.Info("jwtSessionAuth init failed: No JWT login support!")
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err == nil {
authInstance.SessionMaxAge = d
}
if authCfg == nil {
return
}
config.Validate(configSchema, *authCfg)
dec := json.NewDecoder(bytes.NewReader(*authCfg))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Errorf("error while decoding ldap config: %v", err)
}
if Keys.LdapConfig != nil {
ldapAuth := &LdapAuthenticator{}
if err := ldapAuth.Init(); err != nil {
cclog.Warn("Error while initializing authentication -> ldapAuth init failed")
} else {
authInstance.LdapAuth = ldapAuth
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
}
} else {
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
cclog.Info("Missing LDAP configuration: No LDAP support!")
}
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
if err := jwtCookieSessionAuth.Init(); err != nil {
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
if Keys.JwtConfig != nil {
authInstance.JwtAuth = &JWTAuthenticator{}
if err := authInstance.JwtAuth.Init(); err != nil {
cclog.Fatal("Error while initializing authentication -> jwtAuth init failed")
}
jwtSessionAuth := &JWTSessionAuthenticator{}
if err := jwtSessionAuth.Init(); err != nil {
cclog.Info("jwtSessionAuth init failed: No JWT login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
}
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
if err := jwtCookieSessionAuth.Init(); err != nil {
cclog.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
}
} else {
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
cclog.Info("Missing JWT configuration: No JWT token support!")
}
} else {
log.Info("Missing JWT configuration: No JWT token support!")
authInstance.LocalAuth = &LocalAuthenticator{}
if err := authInstance.LocalAuth.Init(); err != nil {
cclog.Fatal("Error while initializing authentication -> localAuth init failed")
}
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
})
}
func GetAuthInstance() *Authentication {
if authInstance == nil {
cclog.Fatal("Authentication module not initialized!")
}
auth.LocalAuth = &LocalAuthenticator{}
if err := auth.LocalAuth.Init(); err != nil {
log.Error("Error while initializing authentication -> localAuth init failed")
return nil, err
}
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
return authInstance
}
return auth, nil
// handleUserSync syncs or updates a user in the database based on configuration.
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
//
// Behavior, depending on the result of looking up user.Username:
//   - lookup failed with an error other than sql.ErrNoRows: the error is
//     logged and nothing is written
//   - user does not exist and syncUserOnLogin is set: the user is added
//   - user exists and updateUserOnLogin is set: the stored user is updated
//     from user
//
// Errors from adding or updating are logged and not returned.
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
	r := repository.GetUserRepository()
	dbUser, err := r.GetUser(user.Username)

	// errors.Is also recognizes sql.ErrNoRows when the repository wraps it.
	notFound := errors.Is(err, sql.ErrNoRows)

	switch {
	case err != nil && !notFound:
		cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
	case notFound && syncUserOnLogin: // Add new user
		if err := r.AddUser(user); err != nil {
			cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
		}
	case err == nil && updateUserOnLogin: // Update existing user
		if err := r.UpdateUser(dbUser, user); err != nil {
			cclog.Errorf("Error while updating user '%s' in DB: %v", dbUser.Username, err)
		}
	}
}
// handleTokenUser synchronizes a user taken from a JWT with the database,
// using the sync/update switches of the JWT configuration (Keys.JwtConfig).
func handleTokenUser(tokenUser *schema.User) {
	jwtCfg := Keys.JwtConfig
	handleUserSync(tokenUser, jwtCfg.SyncUserOnLogin, jwtCfg.UpdateUserOnLogin)
}
// handleOIDCUser synchronizes a user obtained via OIDC with the database,
// using the sync/update switches of the OpenID configuration (Keys.OpenIDConfig).
func handleOIDCUser(OIDCUser *schema.User) {
	oidcCfg := Keys.OpenIDConfig
	handleUserSync(OIDCUser, oidcCfg.SyncUserOnLogin, oidcCfg.UpdateUserOnLogin)
}
// SaveSession creates a new "session" cookie session for user and writes it to
// the response.
//
// The session stores the user's username, projects and roles. Cookie options:
//   - MaxAge is set from auth.SessionMaxAge when that is non-zero
//   - Secure is switched off (with a warning) when neither an HTTPS
//     certificate file nor an HTTPS key file is configured
//   - SameSite is always set to strict mode
//
// If creating or saving the session fails, the error is logged, an HTTP 500
// response is written to rw, and the error is returned.
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
	sess, err := auth.sessionStore.New(r, "session")
	if err != nil {
		cclog.Errorf("session creation failed: %s", err.Error())
		http.Error(rw, err.Error(), http.StatusInternalServerError)
		return err
	}

	if maxAge := auth.SessionMaxAge; maxAge != 0 {
		sess.Options.MaxAge = int(maxAge.Seconds())
	}

	httpsConfigured := config.Keys.HTTPSCertFile != "" || config.Keys.HTTPSKeyFile != ""
	if !httpsConfigured {
		cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
		sess.Options.Secure = false
	}
	sess.Options.SameSite = http.SameSiteStrictMode

	sess.Values["username"] = user.Username
	sess.Values["projects"] = user.Projects
	sess.Values["roles"] = user.Roles

	saveErr := auth.sessionStore.Save(r, rw, sess)
	if saveErr == nil {
		return nil
	}

	cclog.Warnf("session save failed: %s", saveErr.Error())
	http.Error(rw, saveErr.Error(), http.StatusInternalServerError)
	return saveErr
}
func (auth *Authentication) Login(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error)) http.Handler {
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
username := r.FormValue("username")
var dbUser *schema.User
ip, _, err := net.SplitHostPort(r.RemoteAddr)
if err != nil {
ip = r.RemoteAddr
}
username := r.FormValue("username")
limiter := getIPUserLimiter(ip, username)
if !limiter.Allow() {
cclog.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
return
}
var dbUser *schema.User
if username != "" {
var err error
dbUser, err = repository.GetUserRepository().GetUser(username)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%v'", username)
cclog.Errorf("Error while loading user '%v'", username)
}
}
@@ -151,79 +355,224 @@ func (auth *Authentication) Login(
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
continue
} else {
log.Debugf("Can login with user %v", user)
cclog.Debugf("Can login with user %v", user)
}
user, err := authenticator.Login(user, rw, r)
if err != nil {
log.Warnf("user login failed: %s", err.Error())
cclog.Warnf("user login failed: %s", err.Error())
onfailure(rw, r, err)
return
}
session, err := auth.sessionStore.New(r, "session")
if err != nil {
log.Errorf("session creation failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
if err := auth.SaveSession(rw, r, user); err != nil {
return
}
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
session.Values["username"] = user.Username
session.Values["projects"] = user.Projects
session.Values["roles"] = user.Roles
if err := auth.sessionStore.Save(r, rw, session); err != nil {
log.Warnf("session save failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
if r.FormValue("redirect") != "" {
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
return
}
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Debugf("login failed: no authenticator applied")
cclog.Debugf("login failed: no authenticator applied")
onfailure(rw, r, errors.New("no authenticator applied"))
})
}
func (auth *Authentication) Auth(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error)) http.Handler {
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("authentication failed: %s", err.Error())
cclog.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
if user == nil {
user, err = auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("authentication failed: %s", err.Error())
cclog.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
}
if user != nil {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Debug("authentication failed")
cclog.Info("auth -> authentication failed")
onfailure(rw, r, errors.New("unauthorized (please login first)"))
})
}
// AuthAPI returns a middleware that authenticates a request via JWT and
// additionally runs securedCheck on it before delegating to onsuccess.
//
// A request is forwarded to onsuccess (with the user stored in the request
// context under repository.ContextUserKey) when the JWT user either
//   - has exactly one role and that role is schema.RoleApi, or
//   - has two or more roles including both schema.RoleAdmin and schema.RoleApi.
//
// In every other case onfailure is invoked exactly once with the reason.
func (auth *Authentication) AuthAPI(
	onsuccess http.Handler,
	onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		user, err := auth.JwtAuth.AuthViaJWT(rw, r)
		if err != nil {
			cclog.Infof("auth api -> authentication failed: %s", err.Error())
			onfailure(rw, r, err)
			return
		}

		// securedCheck also rejects a nil user, so user is non-nil below.
		ipErr := securedCheck(user, r)
		if ipErr != nil {
			cclog.Infof("auth api -> secured check failed: %s", ipErr.Error())
			onfailure(rw, r, ipErr)
			return
		}

		if user != nil {
			switch {
			case len(user.Roles) == 1:
				if user.HasRole(schema.RoleApi) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			case len(user.Roles) >= 2:
				if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			default:
				cclog.Info("auth api -> authentication failed: missing role")
				onfailure(rw, r, errors.New("unauthorized"))
				// Return here: falling through would invoke onfailure a second
				// time and write a second response for the same request.
				return
			}
		}

		cclog.Info("auth api -> authentication failed: no auth")
		onfailure(rw, r, errors.New("unauthorized"))
	})
}
// AuthUserAPI returns a middleware that authenticates a request via JWT for
// the user-facing API before delegating to onsuccess.
//
// A request is forwarded to onsuccess (with the user stored in the request
// context under repository.ContextUserKey) when the JWT user either
//   - has exactly one role and that role is schema.RoleApi, or
//   - has two or more roles including schema.RoleApi and at least one of
//     schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin.
//
// In every other case (including no JWT user at all) onfailure is invoked
// exactly once with the reason.
func (auth *Authentication) AuthUserAPI(
	onsuccess http.Handler,
	onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		user, err := auth.JwtAuth.AuthViaJWT(rw, r)
		if err != nil {
			cclog.Infof("auth user api -> authentication failed: %s", err.Error())
			onfailure(rw, r, err)
			return
		}

		if user != nil {
			switch {
			case len(user.Roles) == 1:
				if user.HasRole(schema.RoleApi) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			case len(user.Roles) >= 2:
				if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			default:
				cclog.Info("auth user api -> authentication failed: missing role")
				onfailure(rw, r, errors.New("unauthorized"))
				// Return here: falling through would invoke onfailure a second
				// time and write a second response for the same request.
				return
			}
		}

		cclog.Info("auth user api -> authentication failed: no auth")
		onfailure(rw, r, errors.New("unauthorized"))
	})
}
// AuthMetricStoreAPI returns a middleware that authenticates a request via JWT
// for the metric store API before delegating to onsuccess.
//
// A request is forwarded to onsuccess (with the user stored in the request
// context under repository.ContextUserKey) when the JWT user either
//   - has exactly one role and that role is schema.RoleApi, or
//   - has two or more roles including schema.RoleApi and at least one of
//     schema.RoleUser, schema.RoleManager, schema.RoleAdmin.
//
// In every other case (including no JWT user at all) onfailure is invoked
// exactly once with the reason.
func (auth *Authentication) AuthMetricStoreAPI(
	onsuccess http.Handler,
	onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
	return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
		user, err := auth.JwtAuth.AuthViaJWT(rw, r)
		if err != nil {
			cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error())
			onfailure(rw, r, err)
			return
		}

		if user != nil {
			switch {
			case len(user.Roles) == 1:
				if user.HasRole(schema.RoleApi) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			case len(user.Roles) >= 2:
				if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
					ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
					onsuccess.ServeHTTP(rw, r.WithContext(ctx))
					return
				}
			default:
				cclog.Info("auth metricstore api -> authentication failed: missing role")
				onfailure(rw, r, errors.New("unauthorized"))
				// Return here: falling through would invoke onfailure a second
				// time and write a second response for the same request.
				return
			}
		}

		cclog.Info("auth metricstore api -> authentication failed: no auth")
		onfailure(rw, r, errors.New("unauthorized"))
	})
}
// AuthConfigAPI returns a middleware that authenticates a request via the
// session and only lets users holding schema.RoleAdmin through to onsuccess.
// The authenticated user is stored in the request context under
// repository.ContextUserKey; every rejection is reported through onfailure.
func (auth *Authentication) AuthConfigAPI(
	onsuccess http.Handler,
	onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
	handle := func(rw http.ResponseWriter, r *http.Request) {
		sessionUser, sessionErr := auth.AuthViaSession(rw, r)
		if sessionErr != nil {
			cclog.Infof("auth config api -> authentication failed: %s", sessionErr.Error())
			onfailure(rw, r, sessionErr)
			return
		}

		// No session user, or a session user without the admin role.
		if sessionUser == nil || !sessionUser.HasRole(schema.RoleAdmin) {
			cclog.Info("auth config api -> authentication failed: no auth")
			onfailure(rw, r, errors.New("unauthorized"))
			return
		}

		userCtx := context.WithValue(r.Context(), repository.ContextUserKey, sessionUser)
		onsuccess.ServeHTTP(rw, r.WithContext(userCtx))
	}

	return http.HandlerFunc(handle)
}
// AuthFrontendAPI returns a middleware that authenticates a request via the
// session and forwards any authenticated user to onsuccess, regardless of role.
// The authenticated user is stored in the request context under
// repository.ContextUserKey; every rejection is reported through onfailure.
func (auth *Authentication) AuthFrontendAPI(
	onsuccess http.Handler,
	onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
	handle := func(rw http.ResponseWriter, r *http.Request) {
		sessionUser, sessionErr := auth.AuthViaSession(rw, r)
		if sessionErr != nil {
			cclog.Infof("auth frontend api -> authentication failed: %s", sessionErr.Error())
			onfailure(rw, r, sessionErr)
			return
		}

		if sessionUser == nil {
			cclog.Info("auth frontend api -> authentication failed: no auth")
			onfailure(rw, r, errors.New("unauthorized"))
			return
		}

		userCtx := context.WithValue(r.Context(), repository.ContextUserKey, sessionUser)
		onsuccess.ServeHTTP(rw, r.WithContext(userCtx))
	}

	return http.HandlerFunc(handle)
}
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
session, err := auth.sessionStore.Get(r, "session")
@@ -243,3 +592,42 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
onsuccess.ServeHTTP(rw, r)
})
}
// securedCheck verifies that the request originates from an address listed in
// config.Keys.APIAllowedIPs. It is a helper for the middleware auth handlers.
//
// The address is taken from the first non-empty value of the "X-Real-Ip"
// header, the "X-Forwarded-For" header and r.RemoteAddr, in that order; the
// value is used as-is apart from stripping a port. It returns an error when
// user is nil, when no allowed IPs are configured, or when the address is not
// in the list. A "*" as the first list entry allows every address.
func securedCheck(user *schema.User, r *http.Request) error {
	if user == nil {
		return fmt.Errorf("no user for secured check")
	}

	// Pick the first non-empty address candidate.
	var clientAddr string
	candidates := []string{
		r.Header.Get("X-Real-Ip"),
		r.Header.Get("X-Forwarded-For"),
		r.RemoteAddr,
	}
	for _, candidate := range candidates {
		if candidate != "" {
			clientAddr = candidate
			break
		}
	}

	// Strip the port (and IPv6 brackets) when present. If SplitHostPort
	// fails, the value is kept unchanged and treated as a bare host.
	host, _, splitErr := net.SplitHostPort(clientAddr)
	if splitErr == nil {
		clientAddr = host
	}

	allowed := config.Keys.APIAllowedIPs
	switch {
	case len(allowed) == 0:
		// Nothing declared in config: deny all requests to this endpoint.
		return fmt.Errorf("missing configuration key ApiAllowedIPs")
	case allowed[0] == "*":
		// Wildcard declared in config: allow every address.
		return nil
	case !util.Contains(allowed, clientAddr):
		return fmt.Errorf("unknown ip: %v", clientAddr)
	}

	return nil
}

176
internal/auth/auth_test.go Normal file
View File

@@ -0,0 +1,176 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"net"
"testing"
"time"
)
// TestGetIPUserLimiter checks that getIPUserLimiter hands out one limiter per
// (IP, username) pair: repeated lookups return the same instance, while a
// different user or a different IP yields a different instance.
func TestGetIPUserLimiter(t *testing.T) {
	const (
		baseIP   = "192.168.1.1"
		baseUser = "testuser"
	)

	// First lookup creates the limiter.
	first := getIPUserLimiter(baseIP, baseUser)
	if first == nil {
		t.Fatal("Expected limiter to be created")
	}

	// Second lookup with the same key must return the same instance.
	if again := getIPUserLimiter(baseIP, baseUser); first != again {
		t.Error("Expected to get the same limiter instance")
	}

	// Same IP, different user.
	if otherUser := getIPUserLimiter(baseIP, "otheruser"); first == otherUser {
		t.Error("Expected different limiter for different user")
	}

	// Same user, different IP.
	if otherIP := getIPUserLimiter("192.168.1.2", baseUser); first == otherIP {
		t.Error("Expected different limiter for different IP")
	}
}
// TestRateLimiterBehavior checks that a fresh limiter admits the first five
// requests and rejects the sixth.
func TestRateLimiterBehavior(t *testing.T) {
	const allowedAttempts = 5

	limiter := getIPUserLimiter("10.0.0.1", "ratelimituser")

	// The first five attempts are expected to pass.
	for attempt := 1; attempt <= allowedAttempts; attempt++ {
		if !limiter.Allow() {
			t.Errorf("Request %d should be allowed within rate limit", attempt)
		}
	}

	// The next attempt is expected to be blocked.
	if limiter.Allow() {
		t.Error("Request 6 should be blocked by rate limiter")
	}
}
// TestCleanupOldRateLimiters tests the cleanup function: limiters must survive
// a cleanup whose cutoff lies in the past and must be dropped by a cleanup
// whose cutoff lies in the future.
func TestCleanupOldRateLimiters(t *testing.T) {
	// Clear all existing limiters first to avoid interference from other tests
	cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))

	// Create some new rate limiters
	limiter1 := getIPUserLimiter("1.1.1.1", "user1")
	limiter2 := getIPUserLimiter("2.2.2.2", "user2")
	if limiter1 == nil || limiter2 == nil {
		t.Fatal("Failed to create test limiters")
	}

	// Cleanup with a cutoff one second in the past (should keep both)
	time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
	cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))

	// Verify they still exist (should get same instance)
	if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
		t.Error("Limiter 1 was incorrectly cleaned up")
	}
	if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
		t.Error("Limiter 2 was incorrectly cleaned up")
	}

	// Cleanup with a cutoff two hours in the future (should remove both)
	cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))

	// Getting them again should create new instances; check both limiters,
	// not only the first one.
	if getIPUserLimiter("1.1.1.1", "user1") == limiter1 {
		t.Error("Old limiter should have been cleaned up")
	}
	if getIPUserLimiter("2.2.2.2", "user2") == limiter2 {
		t.Error("Old limiter 2 should have been cleaned up")
	}
}
// TestIPv4Extraction checks the port-stripping idiom built on
// net.SplitHostPort for IPv4 inputs: a port is removed when present and the
// input is kept unchanged when splitting fails.
func TestIPv4Extraction(t *testing.T) {
	cases := []struct {
		name     string
		input    string
		expected string
	}{
		{"IPv4 with port", "192.168.1.1:8080", "192.168.1.1"},
		{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
		{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.input
			host, _, err := net.SplitHostPort(tc.input)
			if err == nil {
				got = host
			}

			if got != tc.expected {
				t.Errorf("Expected %s, got %s", tc.expected, got)
			}
		})
	}
}
// TestIPv6Extraction checks the port-stripping idiom built on
// net.SplitHostPort for IPv6 inputs: brackets and port are removed from
// "[addr]:port", while a bare address is kept unchanged.
func TestIPv6Extraction(t *testing.T) {
	cases := []struct {
		name     string
		input    string
		expected string
	}{
		{"IPv6 with port", "[2001:db8::1]:8080", "2001:db8::1"},
		{"IPv6 localhost with port", "[::1]:3000", "::1"},
		{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
		{"IPv6 localhost", "::1", "::1"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.input
			host, _, err := net.SplitHostPort(tc.input)
			if err == nil {
				got = host
			}

			if got != tc.expected {
				t.Errorf("Expected %s, got %s", tc.expected, got)
			}
		})
	}
}
// TestIPExtractionEdgeCases checks the port-stripping idiom built on
// net.SplitHostPort for unusual inputs: a hostname without port, the empty
// string, and a value consisting only of a port.
func TestIPExtractionEdgeCases(t *testing.T) {
	cases := []struct {
		name     string
		input    string
		expected string
	}{
		{"Hostname without port", "example.com", "example.com"},
		{"Empty string", "", ""},
		{"Just port", ":8080", ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.input
			host, _, err := net.SplitHostPort(tc.input)
			if err == nil {
				got = host
			}

			if got != tc.expected {
				t.Errorf("Expected %s, got %s", tc.expected, got)
			}
		})
	}
}

View File

@@ -1,7 +1,8 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
@@ -13,13 +14,33 @@ import (
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/golang-jwt/jwt/v4"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/golang-jwt/jwt/v5"
)
// JWTAuthConfig groups the JSON-configurable settings for JWT based
// authentication.
type JWTAuthConfig struct {
// MaxAge specifies for how long a JWT token shall be valid,
// as a string parsable by time.ParseDuration().
MaxAge string `json:"max-age"`
// CookieName specifies which cookie should be checked for a JWT token
// (if no authorization header is present).
CookieName string `json:"cookieName"`
// ValidateUser denies login for users not in the database (but defined in the JWT).
// User roles defined in JWTs ('roles' claim) are ignored and taken from the db instead.
ValidateUser bool `json:"validateUser"`
// TrustedIssuer specifies which issuer should be accepted when validating
// external JWTs ('iss' claim).
TrustedIssuer string `json:"trustedIssuer"`
// SyncUserOnLogin: should a non-existent user be added to the DB based on
// the information in the token.
SyncUserOnLogin bool `json:"syncUserOnLogin"`
// UpdateUserOnLogin: should an existing user be updated in the DB based on
// the information in the token.
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
type JWTAuthenticator struct {
publicKey ed25519.PublicKey
privateKey ed25519.PrivateKey
@@ -28,17 +49,17 @@ type JWTAuthenticator struct {
func (ja *JWTAuthenticator) Init() error {
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode JWT public key")
cclog.Warn("Could not decode JWT public key")
return err
}
ja.publicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
log.Warn("Could not decode JWT private key")
cclog.Warn("Could not decode JWT private key")
return err
}
ja.privateKey = ed25519.PrivateKey(bytes)
@@ -49,8 +70,8 @@ func (ja *JWTAuthenticator) Init() error {
func (ja *JWTAuthenticator) AuthViaJWT(
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
rawtoken := r.Header.Get("X-Auth-Token")
if rawtoken == "" {
rawtoken = r.Header.Get("Authorization")
@@ -62,7 +83,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
return nil, nil
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
@@ -70,54 +91,35 @@ func (ja *JWTAuthenticator) AuthViaJWT(
return ja.publicKey, nil
})
if err != nil {
log.Warn("Error while parsing JWT token")
cclog.Warn("Error while parsing JWT token")
return nil, err
}
if err := token.Claims.Valid(); err != nil {
log.Warn("jwt token claims are not valid")
return nil, err
if !token.Valid {
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
// Token is valid, extract payload
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var roles []string
// Validate user + roles from JWT against database?
if config.Keys.JwtConfig.ValidateUser {
ur := repository.GetUserRepository()
user, err := ur.GetUser(sub)
// Deny any logins for unknown usernames
if err != nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Take user roles from database instead of trusting the JWT
roles = user.Roles
} else {
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
roles = append(roles, r)
}
}
}
// Use shared helper to get user from JWT claims
var user *schema.User
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
if err != nil {
return nil, err
}
return &schema.User{
Username: sub,
Roles: roles,
AuthType: schema.AuthToken,
AuthSource: -1,
}, nil
// If not validating user, we only get roles from JWT (no projects for this auth method)
if !Keys.JwtConfig.ValidateUser {
user.Roles = extractRolesFromClaims(claims, false)
user.Projects = nil // Standard JWT auth doesn't include projects
}
return user, nil
}
// Generate a new JWT that can be used for authentication
// ProvideJWT generates a new JWT that can be used for authentication
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
if ja.privateKey == nil {
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
}
@@ -128,8 +130,8 @@ func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
"roles": user.Roles,
"iat": now.Unix(),
}
if config.Keys.JwtConfig.MaxAge != "" {
d, err := time.ParseDuration(config.Keys.JwtConfig.MaxAge)
if Keys.JwtConfig.MaxAge != "" {
d, err := time.ParseDuration(Keys.JwtConfig.MaxAge)
if err != nil {
return "", errors.New("cannot parse max-age config key")
}

View File

@@ -1,22 +1,20 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"crypto/ed25519"
"encoding/base64"
"errors"
"fmt"
"net/http"
"os"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/golang-jwt/jwt/v4"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/golang-jwt/jwt/v5"
)
type JWTCookieSessionAuthenticator struct {
@@ -30,18 +28,18 @@ var _ Authenticator = (*JWTCookieSessionAuthenticator)(nil)
func (ja *JWTCookieSessionAuthenticator) Init() error {
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode JWT public key")
cclog.Warn("Could not decode JWT public key")
return err
}
ja.publicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
log.Warn("Could not decode JWT private key")
cclog.Warn("Could not decode JWT private key")
return err
}
ja.privateKey = ed25519.PrivateKey(bytes)
@@ -52,36 +50,35 @@ func (ja *JWTCookieSessionAuthenticator) Init() error {
if keyFound && pubKeyCrossLogin != "" {
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
if err != nil {
log.Warn("Could not decode cross login JWT public key")
cclog.Warn("Could not decode cross login JWT public key")
return err
}
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
} else {
ja.publicKeyCrossLogin = nil
log.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
cclog.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
}
jc := config.Keys.JwtConfig
// Warn if other necessary settings are not configured
if jc != nil {
if jc.CookieName == "" {
log.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
if Keys.JwtConfig != nil {
if Keys.JwtConfig.CookieName == "" {
cclog.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
}
if !jc.ValidateUser {
log.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
if !Keys.JwtConfig.ValidateUser {
cclog.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
}
if jc.TrustedIssuer == "" {
log.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
if Keys.JwtConfig.TrustedIssuer == "" {
cclog.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
}
} else {
log.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
cclog.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
}
log.Info("JWT Cookie Session authenticator successfully registered")
cclog.Info("JWT Cookie Session authenticator successfully registered")
return nil
}
@@ -89,9 +86,9 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
user *schema.User,
username string,
rw http.ResponseWriter,
r *http.Request) (*schema.User, bool) {
jc := config.Keys.JwtConfig
r *http.Request,
) (*schema.User, bool) {
jc := Keys.JwtConfig
cookieName := ""
if jc.CookieName != "" {
cookieName = jc.CookieName
@@ -112,9 +109,9 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
func (ja *JWTCookieSessionAuthenticator) Login(
user *schema.User,
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
jc := config.Keys.JwtConfig
r *http.Request,
) (*schema.User, error) {
jc := Keys.JwtConfig
jwtCookie, err := r.Cookie(jc.CookieName)
var rawtoken string
@@ -122,7 +119,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
rawtoken = jwtCookie.Value
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
@@ -139,52 +136,26 @@ func (ja *JWTCookieSessionAuthenticator) Login(
return ja.publicKey, nil
})
if err != nil {
log.Warn("JWT cookie session: error while parsing token")
cclog.Warn("JWT cookie session: error while parsing token")
return nil, err
}
// Check token validity and extract payload
if err := token.Claims.Valid(); err != nil {
log.Warn("jwt token claims are not valid")
return nil, err
if !token.Valid {
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var name string
if wrap, ok := claims["name"].(map[string]interface{}); ok {
if vals, ok := wrap["values"].([]interface{}); ok {
if len(vals) != 0 {
name = fmt.Sprintf("%v", vals[0])
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
}
}
}
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
var roles []string
if jc.ValidateUser {
// Deny any logins for unknown usernames
if user == nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Take user roles from database instead of trusting the JWT
roles = user.Roles
} else {
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
roles = append(roles, r)
}
}
}
// Sync or update user if configured
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
handleTokenUser(user)
}
// (Ask browser to) Delete JWT cookie
@@ -197,23 +168,5 @@ func (ja *JWTCookieSessionAuthenticator) Login(
}
http.SetCookie(rw, deletedCookie)
if user == nil {
projects := make([]string, 0)
user = &schema.User{
Username: sub,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaToken,
}
if jc.SyncUserOnLogin {
if err := repository.GetUserRepository().AddUser(user); err != nil {
log.Errorf("Error while adding user '%s' to DB", user.Username)
}
}
}
return user, nil
}

136
internal/auth/jwtHelpers.go Normal file
View File

@@ -0,0 +1,136 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"database/sql"
"errors"
"fmt"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/golang-jwt/jwt/v5"
)
// extractStringFromClaims returns the claim stored under key when it is a
// string. A missing key or a non-string value yields the empty string.
func extractStringFromClaims(claims jwt.MapClaims, key string) string {
	// A failed type assertion leaves value at its zero value, "".
	value, _ := claims[key].(string)
	return value
}
// extractRolesFromClaims collects the string entries of the "roles" claim.
// Non-string entries are skipped. If validateRoles is true, entries rejected
// by schema.IsValidRole are skipped as well. The result is nil when the claim
// is missing, is not a list, or no entry qualifies.
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
	var roles []string

	rawRoles, isList := claims["roles"].([]any)
	if !isList {
		return roles
	}

	for _, entry := range rawRoles {
		role, isString := entry.(string)
		if !isString {
			continue
		}
		if validateRoles && !schema.IsValidRole(role) {
			continue
		}
		roles = append(roles, role)
	}

	return roles
}
// extractProjectsFromClaims collects the project names of the "projects"
// claim. The claim may be a []any (non-string entries are skipped) or a
// []string. The result is always non-nil; it is empty when the claim is
// missing or has another type.
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
	projects := make([]string, 0)

	switch raw := claims["projects"].(type) {
	case []any:
		for _, entry := range raw {
			if project, isString := entry.(string); isString {
				projects = append(projects, project)
			}
		}
	case []string:
		projects = append(projects, raw...)
	}

	return projects
}
// extractNameFromClaims returns the display name from the "name" claim.
// Two shapes are supported: a plain string, which is returned as-is, and a
// nested structure {name: {values: [...]}}, whose values are formatted with
// %v and joined by single spaces. Any other shape yields the empty string.
func extractNameFromClaims(claims jwt.MapClaims) string {
	switch raw := claims["name"].(type) {
	case string:
		return raw
	case map[string]any:
		parts, isList := raw["values"].([]any)
		if !isList || len(parts) == 0 {
			return ""
		}

		joined := fmt.Sprintf("%v", parts[0])
		for _, part := range parts[1:] {
			joined += fmt.Sprintf(" %v", part)
		}
		return joined
	}

	return ""
}
// getUserFromJWT creates or retrieves a user based on JWT claims.
//
// The 'sub' claim is mandatory and is used as the username.
//
// If validateUser is true, the user must exist in the database and the
// database record is returned unchanged; authType and authSource are not
// applied in that case. A lookup that reports sql.ErrNoRows (directly or
// wrapped) or yields no user results in an "unknown user" error; any other
// lookup failure is returned wrapped as a database error.
//
// Otherwise a new user object is built from the claims (name, validated
// roles, projects) with the given authType and authSource.
// authSource should be a schema.AuthSource constant (like schema.AuthViaToken).
func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.AuthType, authSource schema.AuthSource) (*schema.User, error) {
	sub := extractStringFromClaims(claims, "sub")
	if sub == "" {
		return nil, errors.New("missing 'sub' claim in JWT")
	}

	if validateUser {
		// Validate user against database
		ur := repository.GetUserRepository()
		user, err := ur.GetUser(sub)
		// errors.Is also recognises sql.ErrNoRows when the repository wraps it.
		if err != nil && !errors.Is(err, sql.ErrNoRows) {
			cclog.Errorf("Error while loading user '%v': %v", sub, err)
			return nil, fmt.Errorf("database error: %w", err)
		}

		// Deny any logins for unknown usernames. At this point a non-nil err
		// can only be a "no rows" error.
		if err != nil || user == nil {
			cclog.Warn("Could not find user from JWT in internal database.")
			return nil, errors.New("unknown user")
		}

		// Return database user (with database roles)
		return user, nil
	}

	// Create user from JWT claims
	name := extractNameFromClaims(claims)
	roles := extractRolesFromClaims(claims, true) // Validate roles
	projects := extractProjectsFromClaims(claims)

	return &schema.User{
		Username:   sub,
		Name:       name,
		Roles:      roles,
		Projects:   projects,
		AuthType:   authType,
		AuthSource: authSource,
	}, nil
}

View File

@@ -0,0 +1,281 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/golang-jwt/jwt/v5"
)
// TestExtractStringFromClaims checks that extractStringFromClaims returns
// string claims verbatim and the empty string for missing or non-string claims.
func TestExtractStringFromClaims(t *testing.T) {
	claims := jwt.MapClaims{
		"sub":   "testuser",
		"email": "test@example.com",
		"age":   25, // not a string
	}

	cases := []struct {
		name     string
		key      string
		expected string
	}{
		{"Existing string", "sub", "testuser"},
		{"Another string", "email", "test@example.com"},
		{"Non-existent key", "missing", ""},
		{"Non-string value", "age", ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := extractStringFromClaims(claims, tc.key); got != tc.expected {
				t.Errorf("Expected %s, got %s", tc.expected, got)
			}
		})
	}
}
// TestExtractRolesFromClaims checks role extraction with and without role
// validation, including a missing claim and a claim that is not a list.
func TestExtractRolesFromClaims(t *testing.T) {
	cases := []struct {
		name          string
		claims        jwt.MapClaims
		validateRoles bool
		expected      []string
	}{
		{
			name: "Valid roles without validation",
			claims: jwt.MapClaims{
				"roles": []any{"admin", "user", "invalid_role"},
			},
			validateRoles: false,
			expected:      []string{"admin", "user", "invalid_role"},
		},
		{
			name: "Valid roles with validation",
			claims: jwt.MapClaims{
				"roles": []any{"admin", "user", "api"},
			},
			validateRoles: true,
			expected:      []string{"admin", "user", "api"},
		},
		{
			name: "Invalid roles with validation",
			claims: jwt.MapClaims{
				"roles": []any{"invalid_role", "fake_role"},
			},
			validateRoles: true,
			expected:      []string{}, // Should filter out invalid roles
		},
		{
			name:          "No roles claim",
			claims:        jwt.MapClaims{},
			validateRoles: false,
			expected:      []string{},
		},
		{
			name: "Non-array roles",
			claims: jwt.MapClaims{
				"roles": "admin",
			},
			validateRoles: false,
			expected:      []string{},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := extractRolesFromClaims(tc.claims, tc.validateRoles)

			if len(got) != len(tc.expected) {
				t.Errorf("Expected %d roles, got %d", len(tc.expected), len(got))
				return
			}

			// Lengths match here, so both slices can be walked by the same index.
			for idx, want := range tc.expected {
				if got[idx] != want {
					t.Errorf("Expected role %s at position %d, got %s", want, idx, got[idx])
				}
			}
		})
	}
}
// TestExtractProjectsFromClaims checks project extraction for both supported
// claim shapes ([]any and []string), a missing claim, and a list with
// non-string entries.
func TestExtractProjectsFromClaims(t *testing.T) {
	cases := []struct {
		name     string
		claims   jwt.MapClaims
		expected []string
	}{
		{
			name: "Projects as array of interfaces",
			claims: jwt.MapClaims{
				"projects": []any{"project1", "project2", "project3"},
			},
			expected: []string{"project1", "project2", "project3"},
		},
		{
			name: "Projects as string array",
			claims: jwt.MapClaims{
				"projects": []string{"projectA", "projectB"},
			},
			expected: []string{"projectA", "projectB"},
		},
		{
			name:     "No projects claim",
			claims:   jwt.MapClaims{},
			expected: []string{},
		},
		{
			name: "Mixed types in projects array",
			claims: jwt.MapClaims{
				"projects": []any{"project1", 123, "project2"},
			},
			expected: []string{"project1", "project2"}, // Should skip non-strings
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := extractProjectsFromClaims(tc.claims)

			if len(got) != len(tc.expected) {
				t.Errorf("Expected %d projects, got %d", len(tc.expected), len(got))
				return
			}

			// Lengths match here, so both slices can be walked by the same index.
			for idx, want := range tc.expected {
				if got[idx] != want {
					t.Errorf("Expected project %s at position %d, got %s", want, idx, got[idx])
				}
			}
		})
	}
}
// TestExtractNameFromClaims checks extractNameFromClaims for a plain string
// name, for the nested {"values": [...]} form, and for missing or empty
// input.
func TestExtractNameFromClaims(t *testing.T) {
	// nested builds the wrapped name claim: {"name": {"values": [...]}}.
	nested := func(values ...any) jwt.MapClaims {
		return jwt.MapClaims{
			"name": map[string]any{
				"values": values,
			},
		}
	}

	cases := []struct {
		desc   string
		claims jwt.MapClaims
		want   string
	}{
		{desc: "Simple string name", claims: jwt.MapClaims{"name": "John Doe"}, want: "John Doe"},
		{desc: "Nested name structure", claims: nested("John", "Doe"), want: "John Doe"},
		{desc: "Nested name with single value", claims: nested("Alice"), want: "Alice"},
		{desc: "No name claim", claims: jwt.MapClaims{}, want: ""},
		{desc: "Empty nested values", claims: jwt.MapClaims{"name": map[string]any{"values": []any{}}}, want: ""},
		// Non-string values are expected to be converted to their string form.
		{desc: "Nested with non-string values", claims: nested(123, "Smith"), want: "123 Smith"},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			if got := extractNameFromClaims(tc.claims); got != tc.want {
				t.Errorf("Expected '%s', got '%s'", tc.want, got)
			}
		})
	}
}
// TestGetUserFromJWT_NoValidation tests getUserFromJWT without database validation
func TestGetUserFromJWT_NoValidation(t *testing.T) {
claims := jwt.MapClaims{
"sub": "testuser",
"name": "Test User",
"roles": []any{"user", "admin"},
"projects": []any{"project1", "project2"},
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
if user.Username != "testuser" {
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
}
if user.Name != "Test User" {
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
}
if len(user.Roles) != 2 {
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
}
if len(user.Projects) != 2 {
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
}
if user.AuthType != schema.AuthToken {
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
}
}
// TestGetUserFromJWT_MissingSub verifies that getUserFromJWT returns an error
// with the expected message when the claims contain no "sub" entry.
func TestGetUserFromJWT_MissingSub(t *testing.T) {
	claims := jwt.MapClaims{
		"name": "Test User",
	}

	_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
	if err == nil {
		// Fatal, not Error: the message check below calls err.Error() and
		// would panic on a nil error instead of reporting a clean failure.
		t.Fatal("Expected error for missing sub claim")
	}

	if err.Error() != "missing 'sub' claim in JWT" {
		t.Errorf("Expected specific error message, got: %v", err)
	}
}

View File

@@ -1,7 +1,8 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
@@ -12,11 +13,9 @@ import (
"os"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/golang-jwt/jwt/v4"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/golang-jwt/jwt/v5"
)
type JWTSessionAuthenticator struct {
@@ -29,13 +28,13 @@ func (ja *JWTSessionAuthenticator) Init() error {
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode cross login JWT HS512 key")
cclog.Warn("Could not decode cross login JWT HS512 key")
return err
}
ja.loginTokenKey = bytes
}
log.Info("JWT Session authenticator successfully registered")
cclog.Info("JWT Session authenticator successfully registered")
return nil
}
@@ -43,8 +42,8 @@ func (ja *JWTSessionAuthenticator) CanLogin(
user *schema.User,
username string,
rw http.ResponseWriter,
r *http.Request) (*schema.User, bool) {
r *http.Request,
) (*schema.User, bool) {
return user, r.Header.Get("Authorization") != "" ||
r.URL.Query().Get("login-token") != ""
}
@@ -52,98 +51,40 @@ func (ja *JWTSessionAuthenticator) CanLogin(
func (ja *JWTSessionAuthenticator) Login(
user *schema.User,
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
rawtoken := strings.TrimPrefix(r.Header.Get("Authorization"), "Bearer ")
if rawtoken == "" {
rawtoken = r.URL.Query().Get("login-token")
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
return ja.loginTokenKey, nil
}
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
})
if err != nil {
log.Warn("Error while parsing jwt token")
cclog.Warn("Error while parsing jwt token")
return nil, err
}
if err = token.Claims.Valid(); err != nil {
log.Warn("jwt token claims are not valid")
return nil, err
if !token.Valid {
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var name string
if wrap, ok := claims["name"].(map[string]interface{}); ok {
if vals, ok := wrap["values"].([]interface{}); ok {
if len(vals) != 0 {
name = fmt.Sprintf("%v", vals[0])
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
}
}
}
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
var roles []string
if config.Keys.JwtConfig.ValidateUser {
// Deny any logins for unknown usernames
if user == nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Take user roles from database instead of trusting the JWT
roles = user.Roles
} else {
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
if schema.IsValidRole(r) {
roles = append(roles, r)
}
}
}
}
}
projects := make([]string, 0)
// Java/Grails Issued Token
// if rawprojs, ok := claims["projects"].([]interface{}); ok {
// for _, pp := range rawprojs {
// if p, ok := pp.(string); ok {
// projects = append(projects, p)
// }
// }
// } else if rawprojs, ok := claims["projects"]; ok {
// for _, p := range rawprojs.([]string) {
// projects = append(projects, p)
// }
// }
if user == nil {
user = &schema.User{
Username: sub,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaToken,
}
if config.Keys.JwtConfig.SyncUserOnLogin {
if err := repository.GetUserRepository().AddUser(user); err != nil {
log.Errorf("Error while adding user '%s' to DB", user.Username)
}
}
// Sync or update user if configured
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
handleTokenUser(user)
}
return user, nil

View File

@@ -1,7 +1,8 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
@@ -10,18 +11,30 @@ import (
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/go-ldap/ldap/v3"
)
// LdapConfig holds the LDAP settings read by LdapAuthenticator.
type LdapConfig struct {
// URL is the address passed to ldap.DialURL when opening a connection.
URL string `json:"url"`
// NOTE(review): presumably the base DN under which users are searched — its use is not visible here, confirm.
UserBase string `json:"user_base"`
// SearchDN is the DN used, together with LDAP_ADMIN_PASSWORD, for the admin bind.
SearchDN string `json:"search_dn"`
// UserBind is the bind DN template; "{username}" is replaced by the login name.
UserBind string `json:"user_bind"`
// UserFilter is the search filter used by Sync to list users.
UserFilter string `json:"user_filter"`
// UserAttr names the attribute holding the user's full name; "gecos" is used when empty.
UserAttr string `json:"username_attr"`
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
// SyncDelOldUsers makes Sync delete users that are in the DB but no longer in LDAP.
SyncDelOldUsers bool `json:"sync_del_old_users"`
// SyncUserOnLogin controls whether a user missing from the DB is added at
// login when the user exists in the LDAP directory.
SyncUserOnLogin bool `json:"syncUserOnLogin"`
}
type LdapAuthenticator struct {
syncPassword string
UserAttr string
UserAttr string
}
var _ Authenticator = (*LdapAuthenticator)(nil)
@@ -29,40 +42,11 @@ var _ Authenticator = (*LdapAuthenticator)(nil)
func (la *LdapAuthenticator) Init() error {
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
if la.syncPassword == "" {
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
cclog.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
}
lc := config.Keys.LdapConfig
if lc.SyncInterval != "" {
interval, err := time.ParseDuration(lc.SyncInterval)
if err != nil {
log.Warnf("Could not parse duration for sync interval: %v",
lc.SyncInterval)
return err
}
if interval == 0 {
log.Info("Sync interval is zero")
return nil
}
go func() {
ticker := time.NewTicker(interval)
for t := range ticker.C {
log.Printf("sync started at %s", t.Format(time.RFC3339))
if err := la.Sync(); err != nil {
log.Errorf("sync failed: %s", err.Error())
}
log.Print("sync done")
}
}()
} else {
log.Info("LDAP configuration key sync_interval invalid")
}
if lc.UserAttr != "" {
la.UserAttr = lc.UserAttr
if Keys.LdapConfig.UserAttr != "" {
la.UserAttr = Keys.LdapConfig.UserAttr
} else {
la.UserAttr = "gecos"
}
@@ -74,9 +58,9 @@ func (la *LdapAuthenticator) CanLogin(
user *schema.User,
username string,
rw http.ResponseWriter,
r *http.Request) (*schema.User, bool) {
lc := config.Keys.LdapConfig
r *http.Request,
) (*schema.User, bool) {
lc := Keys.LdapConfig
if user != nil {
if user.AuthSource == schema.AuthViaLDAP {
@@ -86,7 +70,8 @@ func (la *LdapAuthenticator) CanLogin(
if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
log.Error("LDAP connection error")
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
@@ -99,12 +84,12 @@ func (la *LdapAuthenticator) CanLogin(
sr, err := l.Search(searchRequest)
if err != nil {
log.Warn(err)
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
log.Warn("LDAP: User does not exist or too many entries returned")
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
@@ -124,7 +109,7 @@ func (la *LdapAuthenticator) CanLogin(
}
if err := repository.GetUserRepository().AddUser(user); err != nil {
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
return nil, false
}
@@ -138,18 +123,18 @@ func (la *LdapAuthenticator) CanLogin(
func (la *LdapAuthenticator) Login(
user *schema.User,
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
l, err := la.getLdapConnection(false)
if err != nil {
log.Warn("Error while getting ldap connection")
cclog.Warn("Error while getting ldap connection")
return nil, err
}
defer l.Close()
userDn := strings.Replace(config.Keys.LdapConfig.UserBind, "{username}", user.Username, -1)
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
user.Username, err)
return nil, fmt.Errorf("Authentication failed")
}
@@ -158,11 +143,11 @@ func (la *LdapAuthenticator) Login(
}
func (la *LdapAuthenticator) Sync() error {
const IN_DB int = 1
const IN_LDAP int = 2
const IN_BOTH int = 3
const InDB int = 1
const InLdap int = 2
const InBoth int = 3
ur := repository.GetUserRepository()
lc := config.Keys.LdapConfig
lc := Keys.LdapConfig
users := map[string]int{}
usernames, err := ur.GetLdapUsernames()
@@ -171,12 +156,12 @@ func (la *LdapAuthenticator) Sync() error {
}
for _, username := range usernames {
users[username] = IN_DB
users[username] = InDB
}
l, err := la.getLdapConnection(true)
if err != nil {
log.Error("LDAP connection error")
cclog.Error("LDAP connection error")
return err
}
defer l.Close()
@@ -187,7 +172,7 @@ func (la *LdapAuthenticator) Sync() error {
lc.UserFilter,
[]string{"dn", "uid", la.UserAttr}, nil))
if err != nil {
log.Warn("LDAP search error")
cclog.Warn("LDAP search error")
return err
}
@@ -200,18 +185,18 @@ func (la *LdapAuthenticator) Sync() error {
_, ok := users[username]
if !ok {
users[username] = IN_LDAP
users[username] = InLdap
newnames[username] = entry.GetAttributeValue(la.UserAttr)
} else {
users[username] = IN_BOTH
users[username] = InBoth
}
}
for username, where := range users {
if where == IN_DB && lc.SyncDelOldUsers {
if where == InDB && lc.SyncDelOldUsers {
ur.DelUser(username)
log.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == IN_LDAP {
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == InLdap {
name := newnames[username]
var roles []string
@@ -226,9 +211,9 @@ func (la *LdapAuthenticator) Sync() error {
AuthSource: schema.AuthViaLDAP,
}
log.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
cclog.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
if err := ur.AddUser(user); err != nil {
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
return err
}
}
@@ -238,18 +223,17 @@ func (la *LdapAuthenticator) Sync() error {
}
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
lc := config.Keys.LdapConfig
conn, err := ldap.DialURL(lc.Url)
lc := Keys.LdapConfig
conn, err := ldap.DialURL(lc.URL)
if err != nil {
log.Warn("LDAP URL dial failed")
cclog.Warn("LDAP URL dial failed")
return nil, err
}
if admin {
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
conn.Close()
log.Warn("LDAP connection bind failed")
cclog.Warn("LDAP connection bind failed")
return nil, err
}
}

View File

@@ -1,15 +1,16 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"golang.org/x/crypto/bcrypt"
)
@@ -27,19 +28,19 @@ func (la *LocalAuthenticator) CanLogin(
user *schema.User,
username string,
rw http.ResponseWriter,
r *http.Request) (*schema.User, bool) {
r *http.Request,
) (*schema.User, bool) {
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
}
func (la *LocalAuthenticator) Login(
user *schema.User,
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
[]byte(r.FormValue("password"))); e != nil {
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
cclog.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
return nil, fmt.Errorf("Authentication failed")
}

212
internal/auth/oidc.go Normal file
View File

@@ -0,0 +1,212 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"context"
"crypto/rand"
"encoding/base64"
"io"
"net/http"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"golang.org/x/oauth2"
)
// OpenIDConfig holds the OpenID Connect settings.
type OpenIDConfig struct {
// Provider is the value handed to oidc.NewProvider in NewOIDC.
Provider string `json:"provider"`
// SyncUserOnLogin and UpdateUserOnLogin: when either is set, OAuth2Callback
// passes the logged-in user to handleOIDCUser.
SyncUserOnLogin bool `json:"syncUserOnLogin"`
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
// OIDC bundles the state needed to run the OpenID Connect login flow.
type OIDC struct {
// client is the OAuth2 configuration used to build the authorization URL
// and to exchange the authorization code for a token.
client *oauth2.Config
// provider is the discovered OIDC provider; it supplies the endpoints and
// the userinfo lookup.
provider *oidc.Provider
// authentication is used to persist the session after a successful login.
authentication *Authentication
// clientID is the value of the OID_CLIENT_ID environment variable.
clientID string
}
// randString returns nByte bytes read from crypto/rand, encoded as unpadded
// URL-safe base64. It returns an empty string and the read error when the
// random source cannot supply enough bytes.
func randString(nByte int) (string, error) {
	buf := make([]byte, nByte)

	_, err := io.ReadFull(rand.Reader, buf)
	if err != nil {
		return "", err
	}

	encoded := base64.RawURLEncoding.EncodeToString(buf)
	return encoded, nil
}
// setCallbackCookie stores value under name in an HTTP-only cookie that
// lives for one hour. The cookie is marked Secure only when the request
// arrived over TLS.
func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value string) {
	const lifetime = time.Hour

	http.SetCookie(w, &http.Cookie{
		Name:     name,
		Value:    value,
		MaxAge:   int(lifetime / time.Second),
		HttpOnly: true,
		Secure:   r.TLS != nil,
	})
}
// NewOIDC builds the OIDC authenticator for the provider configured in
// Keys.OpenIDConfig.Provider. Provider setup is bounded by a 30 second
// timeout and a failure is reported through cclog.Fatal. The client
// credentials come from the OID_CLIENT_ID and OID_CLIENT_SECRET environment
// variables; a missing value only produces a warning.
func NewOIDC(a *Authentication) *OIDC {
	initCtx, stop := context.WithTimeout(context.Background(), 30*time.Second)
	defer stop()

	prov, err := oidc.NewProvider(initCtx, Keys.OpenIDConfig.Provider)
	if err != nil {
		cclog.Fatal(err)
	}

	id := os.Getenv("OID_CLIENT_ID")
	if id == "" {
		cclog.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
	}

	secret := os.Getenv("OID_CLIENT_SECRET")
	if secret == "" {
		cclog.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
	}

	return &OIDC{
		provider: prov,
		client: &oauth2.Config{
			ClientID:     id,
			ClientSecret: secret,
			Endpoint:     prov.Endpoint(),
			RedirectURL:  "oidc-callback",
			Scopes:       []string{oidc.ScopeOpenID, "profile", "email"},
		},
		clientID:       id,
		authentication: a,
	}
}
// RegisterEndpoints attaches the login and callback handlers of the OIDC
// flow to the given router.
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
	routes := []struct {
		path    string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/oidc-login", oa.OAuth2Login},
		{"/oidc-callback", oa.OAuth2Callback},
	}

	for _, route := range routes {
		r.HandleFunc(route.path, route.handler)
	}
}
// OAuth2Callback completes the login started by OAuth2Login. It checks the
// returned state against the "state" cookie, exchanges the authorization
// code (with the PKCE verifier from the "verifier" cookie) for a token,
// fetches the userinfo claims, builds the user, saves the session and
// redirects to "/". Every failure ends the request with an HTTP error and
// no session is created.
func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
	c, err := r.Cookie("state")
	if err != nil {
		http.Error(rw, "state cookie not found", http.StatusBadRequest)
		return
	}
	state := c.Value

	c, err = r.Cookie("verifier")
	if err != nil {
		http.Error(rw, "verifier cookie not found", http.StatusBadRequest)
		return
	}
	codeVerifier := c.Value

	// The parse error is deliberately ignored: a request lacking a usable
	// state or code is rejected by the checks that follow.
	_ = r.ParseForm()
	if r.Form.Get("state") != state {
		http.Error(rw, "State invalid", http.StatusBadRequest)
		return
	}

	code := r.Form.Get("code")
	if code == "" {
		http.Error(rw, "Code not found", http.StatusBadRequest)
		return
	}

	// Exchange authorization code for token with timeout
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
	if err != nil {
		http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
		return
	}

	// Get user info from OIDC provider with same timeout
	userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
	if err != nil {
		http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
		return
	}

	// // Extract the ID Token from OAuth2 token.
	// rawIDToken, ok := token.Extra("id_token").(string)
	// if !ok {
	// 	http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
	// }
	//
	// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
	// // Parse and verify ID Token payload.
	// idToken, err := verifier.Verify(context.Background(), rawIDToken)
	// if err != nil {
	// 	http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
	// }

	projects := make([]string, 0)

	// Extract custom claims
	var claims struct {
		Username string `json:"preferred_username"`
		Name     string `json:"name"`
		Profile  struct {
			Client struct {
				Roles []string `json:"roles"`
			} `json:"clustercockpit"`
		} `json:"resource_access"`
	}
	if err := userInfo.Claims(&claims); err != nil {
		http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
		// Stop here: without this return a session would be saved for a
		// user built from empty claims after the error was already sent.
		return
	}

	// Only the "user" and "admin" roles are taken over from the provider;
	// anything else is dropped.
	var roles []string
	for _, role := range claims.Profile.Client.Roles {
		switch role {
		case "user":
			roles = append(roles, schema.GetRoleString(schema.RoleUser))
		case "admin":
			roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
		}
	}

	// Fall back to the plain user role when no known role was supplied.
	if len(roles) == 0 {
		roles = append(roles, schema.GetRoleString(schema.RoleUser))
	}

	user := &schema.User{
		Username:   claims.Username,
		Name:       claims.Name,
		Roles:      roles,
		Projects:   projects,
		AuthSource: schema.AuthViaOIDC,
	}

	if Keys.OpenIDConfig.SyncUserOnLogin || Keys.OpenIDConfig.UpdateUserOnLogin {
		handleOIDCUser(user)
	}

	// NOTE(review): any result returned by SaveSession is not checked here — confirm against its definition.
	oa.authentication.SaveSession(rw, r, user)
	cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
	userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
	http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
}
// OAuth2Login starts the OIDC login flow. It stores a random state value and
// a PKCE code verifier in callback cookies and redirects the browser to the
// provider's authorization URL.
func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
	state, err := randString(16)
	if err != nil {
		http.Error(rw, "Internal error", http.StatusInternalServerError)
		return
	}
	// The state value is compared against the "state" cookie in the callback.
	setCallbackCookie(rw, r, "state", state)

	// PKCE: the verifier is kept in a cookie and only its S256 challenge is
	// sent along with the authorization request.
	verifier := oauth2.GenerateVerifier()
	setCallbackCookie(rw, r, "verifier", verifier)

	// Send the user to the provider's consent page.
	authURL := oa.client.AuthCodeURL(
		state,
		oauth2.AccessTypeOffline,
		oauth2.S256ChallengeOption(verifier),
	)
	http.Redirect(rw, r, authURL, http.StatusFound)
}

96
internal/auth/schema.go Normal file
View File

@@ -0,0 +1,96 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
// configSchema is the JSON schema text describing the authentication
// configuration: the "jwts", "oidc" and "ldap" sections. Each section is an
// object schema whose settings are listed under "properties".
//
// NOTE(review): the three sections sit directly at the top level rather than
// under a top-level "properties" keyword — confirm the validator applies
// them as intended.
var configSchema = `
{
"jwts": {
"description": "For JWT token authentication.",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": ["max-age"]
},
"oidc": {
"description": "For OpenID Connect authentication.",
"type": "object",
"properties": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
}
},
"required": ["provider"]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
}
},
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
},
"required": ["jwts"]
}`

View File

@@ -1,71 +1,160 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package config implements the program configuration data structures, validation and parsing
package config
import (
"bytes"
"encoding/json"
"log"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
)
var Keys schema.ProgramConfig = schema.ProgramConfig{
// ProgramConfig is the type of Keys, the program-wide settings that Init
// fills by decoding the main configuration.
type ProgramConfig struct {
// Address where the http (or https) server will listen on (for example: 'localhost:80').
Addr string `json:"addr"`
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
APIAllowedIPs []string `json:"apiAllowedIPs"`
// NOTE(review): presumably the NATS subjects used by the API — confirm against the consumer.
APISubjects *NATSConfig `json:"apiSubjects"`
// Drop root permissions once .env was read and the port was taken.
User string `json:"user"`
Group string `json:"group"`
// Disable authentication (for everything: API, Web-UI, ...)
DisableAuthentication bool `json:"disable-authentication"`
// If `embed-static-files` is true (default), the frontend files are directly
// embedded into the go binary and expected to be in web/frontend. Only if
// it is false the files in `static-files` are served instead.
EmbedStaticFiles bool `json:"embed-static-files"`
StaticFiles string `json:"static-files"`
// Database driver - only 'sqlite3' is supported
DBDriver string `json:"db-driver"`
// Path to SQLite database file
DB string `json:"db"`
// Keep all metric data in the metric data repositories,
// do not write to the job-archive.
DisableArchive bool `json:"disable-archive"`
// NOTE(review): presumably switches the job taggers on — confirm against the consumer.
EnableJobTaggers bool `json:"enable-job-taggers"`
// Validate json input against schema
Validate bool `json:"validate"`
// If 0 or empty, the session does not expire!
SessionMaxAge string `json:"session-max-age"`
// If both those options are not empty, use HTTPS using those certificates.
HTTPSCertFile string `json:"https-cert-file"`
HTTPSKeyFile string `json:"https-key-file"`
// If not the empty string and `addr` does not end in ":80",
// redirect every request incoming at port 80 to that url.
RedirectHTTPTo string `json:"redirect-http-to"`
// Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"`
// If not zero, automatically mark jobs as stopped running X seconds longer than their walltime.
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
// Energy Mix CO2 Emission Constant [g/kWh]
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
EmissionConstant int `json:"emission-constant"`
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
// Global upstream metric repository configuration for metric pull workers
UpstreamMetricRepository *json.RawMessage `json:"upstreamMetricRepository,omitempty"`
}
// ResampleConfig holds the resampling settings referenced by
// ProgramConfig.EnableResampling.
type ResampleConfig struct {
// Minimum number of points to trigger resampling of data
// (handed to resampler.SetMinimumRequiredPoints by Init when greater than zero).
MinimumPoints int `json:"minimumPoints"`
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
// NATSConfig is the type of ProgramConfig.APISubjects.
// NOTE(review): the fields presumably name the NATS subjects for job start,
// job stop and node state messages — confirm against the consumer.
type NATSConfig struct {
SubjectJobStart string `json:"subjectJobStart"`
SubjectJobStop string `json:"subjectJobStop"`
SubjectNodeState string `json:"subjectNodeState"`
}
// IntRange is a pair of integer bounds, used by FilterRanges for the
// duration and node-count ranges.
type IntRange struct {
From int `json:"from"`
To int `json:"to"`
}
// TimeRange is a pair of optional time bounds plus a free-form Range string
// that is left out of the JSON output when empty.
type TimeRange struct {
From *time.Time `json:"from"`
To *time.Time `json:"to"`
Range string `json:"range,omitempty"`
}
// FilterRanges groups the per-cluster ranges for duration, number of nodes
// and start time. Every range is optional (pointer fields).
type FilterRanges struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
}
// ClusterConfig is one entry of the cluster configuration: the cluster name
// and its optional filter ranges.
type ClusterConfig struct {
Name string `json:"name"`
FilterRanges *FilterRanges `json:"filterRanges"`
}
// Clusters is the list of configured clusters; Init fills it by decoding the
// cluster configuration and aborts when the list is empty.
var Clusters []*ClusterConfig
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
EmbedStaticFiles: true,
DBDriver: "sqlite3",
DB: "./var/job.db",
Archive: json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`),
DisableArchive: false,
Validate: false,
SessionMaxAge: "168h",
StopJobsExceedingWalltime: 0,
ShortRunningJobsDuration: 5 * 60,
UiDefaults: map[string]interface{}{
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"plot_general_colorBackground": true,
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
"plot_general_lineWidth": 3,
"plot_list_jobsPerPage": 50,
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw"},
"plot_view_plotsPerRow": 3,
"plot_view_showPolarplot": true,
"plot_view_showRoofline": true,
"plot_view_showStatTable": true,
"system_view_selectedMetric": "cpu_load",
"analysis_view_selectedTopEntity": "user",
"analysis_view_selectedTopCategory": "totalWalltime",
"status_view_selectedTopUserCategory": "totalJobs",
"status_view_selectedTopProjectCategory": "totalJobs",
},
}
func Init(flagConfigFile string) {
raw, err := os.ReadFile(flagConfigFile)
if err != nil {
if !os.IsNotExist(err) {
log.Fatalf("CONFIG ERROR: %v", err)
}
} else {
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
log.Fatalf("Validate config: %v\n", err)
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
log.Fatalf("could not decode: %v", err)
}
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
log.Fatal("At least one cluster required in config!")
}
Validate(clustersSchema, clusterConfig)
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
dec.DisallowUnknownFields()
// Decode the cluster section. The abort message must show the cluster
// configuration that failed to decode, not the main configuration.
if err := dec.Decode(&Clusters); err != nil {
	cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", clusterConfig, err.Error())
}
if len(Clusters) < 1 {
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
}
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}
}

View File

@@ -1,16 +1,30 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
"testing"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
Init(fp)
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
} else {
cclog.Abort("Main configuration must be present")
}
if Keys.Addr != "0.0.0.0:443" {
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
}
@@ -18,7 +32,17 @@ func TestInit(t *testing.T) {
func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
Init(fp)
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
} else {
cclog.Abort("Main configuration must be present")
}
if Keys.Addr != "127.0.0.1:8080" {
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
}

View File

@@ -0,0 +1,51 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
"encoding/json"
"os"
"strings"
)
// DefaultMetricsCluster is one entry of the "clusters" array in the default
// metrics JSON file: a cluster name plus its default metrics as one string.
// NOTE(review): DefaultMetrics is presumably a comma-separated list meant for
// ParseMetricsString — confirm against callers.
//
// Deprecated: superseded by the new user config - userConfig.go / web.go.
type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
}
// DefaultMetricsConfig is the top-level document decoded by
// LoadDefaultMetricsConfig; it holds one DefaultMetricsCluster per entry of
// the JSON "clusters" array.
type DefaultMetricsConfig struct {
Clusters []DefaultMetricsCluster `json:"clusters"`
}
// LoadDefaultMetricsConfig reads "default_metrics.json" from the current
// working directory and decodes it into a DefaultMetricsConfig.
//
// A missing file is not an error: the function then returns (nil, nil), so
// callers must check the returned pointer. Any other read failure and any JSON
// decoding failure is returned as the error with a nil config.
func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) {
	const filePath = "default_metrics.json"

	// Read directly instead of stat-then-read: one syscall fewer, and no
	// window in which the file can disappear between the check and the read.
	data, err := os.ReadFile(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, err
	}

	var cfg DefaultMetricsConfig
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}
	return &cfg, nil
}
// ParseMetricsString splits s at every comma, trims surrounding whitespace
// from each piece and returns the non-empty pieces in their original order.
// When no piece survives (for example for an empty s) the result is nil.
func ParseMetricsString(s string) []string {
	var names []string
	for _, field := range strings.Split(s, ",") {
		name := strings.TrimSpace(field)
		if name == "" {
			continue
		}
		names = append(names, name)
	}
	return names
}

222
internal/config/schema.go Normal file
View File

@@ -0,0 +1,222 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
// configSchema is the JSON Schema that Init hands to Validate for the main
// configuration section. It describes the accepted top-level keys and their
// types; "apiAllowedIPs" is the only key it marks as required.
var configSchema = `
{
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
"type": "string"
}
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in web/frontend/public should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db": {
"description": "Path to SQLite database file (e.g., './var/job.db')",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"enable-job-taggers": {
"description": "Turn on automatic application and jobclass taggers",
"type": "boolean"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"type": "integer"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
},
"emission-constant": {
"description": ".",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
},
"enable-resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"minimumPoints": {
"description": "Minimum points to trigger resampling of time-series data.",
"type": "integer"
},
"trigger": {
"description": "Trigger next zoom level at less than this many visible datapoints.",
"type": "integer"
},
"resolutions": {
"description": "Array of resampling target resolutions, in seconds.",
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": ["trigger", "resolutions"]
},
"upstreamMetricRepository": {
"description": "Global upstream metric repository configuration for metric pull workers",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": ["kind"]
}
},
"required": ["apiAllowedIPs"]
}`
// clustersSchema is the JSON Schema that Init hands to Validate for the
// cluster configuration section: a non-empty array of cluster objects, each
// requiring "name" and "filterRanges".
//
// "minItems" is declared on the array itself. Inside the per-item object
// schema the keyword has no effect, because it only constrains arrays.
var clustersSchema = `
{
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": ["kind"]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": ["from", "to"]
}
},
"required": ["numNodes", "duration", "startTime"]
}
},
"required": ["name", "filterRanges"]
}
}`

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
"encoding/json"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)
// Validate checks the raw JSON document instance against the JSON Schema
// given as the string schema, which is compiled under the resource name
// "schema.json".
//
// Nothing is returned. A schema that does not compile, an instance that is
// not valid JSON, and an instance that violates the schema are each reported
// through cclog.Fatalf / cclog.Fatal.
func Validate(schema string, instance json.RawMessage) {
	compiled, compileErr := jsonschema.CompileString("schema.json", schema)
	if compileErr != nil {
		cclog.Fatalf("%#v", compileErr)
	}

	// Decode into a generic value; that is the form the validator consumes.
	var doc any
	decodeErr := json.Unmarshal(instance, &doc)
	if decodeErr != nil {
		cclog.Fatal(decodeErr)
	}

	if violation := compiled.Validate(doc); violation != nil {
		cclog.Fatalf("%#v", violation)
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package model

View File

@@ -3,24 +3,50 @@
package model
import (
"bytes"
"fmt"
"io"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/schema"
)
type ClusterMetricWithName struct {
Name string `json:"name"`
Unit *schema.Unit `json:"unit,omitempty"`
Timestep int `json:"timestep"`
Data []schema.Float `json:"data"`
}
type ClusterMetrics struct {
NodeCount int `json:"nodeCount"`
Metrics []*ClusterMetricWithName `json:"metrics"`
}
type Count struct {
Name string `json:"name"`
Count int `json:"count"`
}
type EnergyFootprintValue struct {
Hardware string `json:"hardware"`
Metric string `json:"metric"`
Value float64 `json:"value"`
}
type FloatRange struct {
From float64 `json:"from"`
To float64 `json:"to"`
}
type FootprintValue struct {
Name string `json:"name"`
Stat string `json:"stat"`
Value float64 `json:"value"`
}
type Footprints struct {
TimeWeights *TimeWeights `json:"timeWeights"`
Metrics []*MetricFootprints `json:"metrics"`
@@ -38,6 +64,7 @@ type IntRangeOutput struct {
type JobFilter struct {
Tags []string `json:"tags,omitempty"`
DbID []string `json:"dbId,omitempty"`
JobID *StringInput `json:"jobId,omitempty"`
ArrayJobID *int `json:"arrayJobId,omitempty"`
User *StringInput `json:"user,omitempty"`
@@ -45,18 +72,16 @@ type JobFilter struct {
JobName *StringInput `json:"jobName,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *schema.IntRange `json:"duration,omitempty"`
Duration *config.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
MinRunningFor *int `json:"minRunningFor,omitempty"`
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
StartTime *schema.TimeRange `json:"startTime,omitempty"`
NumNodes *config.IntRange `json:"numNodes,omitempty"`
NumAccelerators *config.IntRange `json:"numAccelerators,omitempty"`
NumHWThreads *config.IntRange `json:"numHWThreads,omitempty"`
StartTime *config.TimeRange `json:"startTime,omitempty"`
State []schema.JobState `json:"state,omitempty"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
Exclusive *int `json:"exclusive,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
Node *StringInput `json:"node,omitempty"`
}
@@ -78,29 +103,45 @@ type JobMetricWithName struct {
}
type JobResultList struct {
Items []*schema.Job `json:"items"`
Offset *int `json:"offset,omitempty"`
Limit *int `json:"limit,omitempty"`
Count *int `json:"count,omitempty"`
Items []*schema.Job `json:"items"`
Offset *int `json:"offset,omitempty"`
Limit *int `json:"limit,omitempty"`
Count *int `json:"count,omitempty"`
HasNextPage *bool `json:"hasNextPage,omitempty"`
}
type JobStats struct {
ID int `json:"id"`
JobID string `json:"jobId"`
StartTime int `json:"startTime"`
Duration int `json:"duration"`
Cluster string `json:"cluster"`
SubCluster string `json:"subCluster"`
NumNodes int `json:"numNodes"`
NumHWThreads *int `json:"numHWThreads,omitempty"`
NumAccelerators *int `json:"numAccelerators,omitempty"`
Stats []*NamedStats `json:"stats"`
}
type JobsStatistics struct {
ID string `json:"id"`
Name string `json:"name"`
TotalJobs int `json:"totalJobs"`
RunningJobs int `json:"runningJobs"`
ShortJobs int `json:"shortJobs"`
TotalWalltime int `json:"totalWalltime"`
TotalNodes int `json:"totalNodes"`
TotalNodeHours int `json:"totalNodeHours"`
TotalCores int `json:"totalCores"`
TotalCoreHours int `json:"totalCoreHours"`
TotalAccs int `json:"totalAccs"`
TotalAccHours int `json:"totalAccHours"`
HistDuration []*HistoPoint `json:"histDuration"`
HistNumNodes []*HistoPoint `json:"histNumNodes"`
HistNumCores []*HistoPoint `json:"histNumCores"`
HistNumAccs []*HistoPoint `json:"histNumAccs"`
ID string `json:"id"`
Name string `json:"name"`
TotalUsers int `json:"totalUsers"`
TotalJobs int `json:"totalJobs"`
RunningJobs int `json:"runningJobs"`
ShortJobs int `json:"shortJobs"`
TotalWalltime int `json:"totalWalltime"`
TotalNodes int `json:"totalNodes"`
TotalNodeHours int `json:"totalNodeHours"`
TotalCores int `json:"totalCores"`
TotalCoreHours int `json:"totalCoreHours"`
TotalAccs int `json:"totalAccs"`
TotalAccHours int `json:"totalAccHours"`
HistDuration []*HistoPoint `json:"histDuration"`
HistNumNodes []*HistoPoint `json:"histNumNodes"`
HistNumCores []*HistoPoint `json:"histNumCores"`
HistNumAccs []*HistoPoint `json:"histNumAccs"`
HistMetrics []*MetricHistoPoints `json:"histMetrics"`
}
type MetricFootprints struct {
@@ -108,14 +149,83 @@ type MetricFootprints struct {
Data []schema.Float `json:"data"`
}
type MetricHistoPoint struct {
Bin *int `json:"bin,omitempty"`
Count int `json:"count"`
Min *int `json:"min,omitempty"`
Max *int `json:"max,omitempty"`
}
type MetricHistoPoints struct {
Metric string `json:"metric"`
Unit string `json:"unit"`
Stat *string `json:"stat,omitempty"`
Data []*MetricHistoPoint `json:"data,omitempty"`
}
type MetricStatItem struct {
MetricName string `json:"metricName"`
Range *FloatRange `json:"range"`
}
type Mutation struct {
}
type NamedStats struct {
Name string `json:"name"`
Data *schema.MetricStatistics `json:"data"`
}
type NamedStatsWithScope struct {
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
Stats []*ScopedStats `json:"stats"`
}
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
Subcluster *StringInput `json:"subcluster,omitempty"`
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
HealthState *string `json:"healthState,omitempty"`
TimeStart *int `json:"timeStart,omitempty"`
}
type NodeMetrics struct {
Host string `json:"host"`
State string `json:"state"`
SubCluster string `json:"subCluster"`
Metrics []*JobMetricWithName `json:"metrics"`
}
type NodeStateResultList struct {
Items []*schema.Node `json:"items"`
Count *int `json:"count,omitempty"`
}
type NodeStates struct {
State string `json:"state"`
Count int `json:"count"`
}
type NodeStatesTimed struct {
State string `json:"state"`
Counts []int `json:"counts"`
Times []int `json:"times"`
}
type NodesResultList struct {
Items []*NodeMetrics `json:"items"`
Offset *int `json:"offset,omitempty"`
Limit *int `json:"limit,omitempty"`
Count *int `json:"count,omitempty"`
TotalNodes *int `json:"totalNodes,omitempty"`
HasNextPage *bool `json:"hasNextPage,omitempty"`
}
type OrderByInput struct {
Field string `json:"field"`
Type string `json:"type"`
Order SortDirectionEnum `json:"order"`
}
@@ -124,6 +234,12 @@ type PageRequest struct {
Page int `json:"page"`
}
type ScopedStats struct {
Hostname string `json:"hostname"`
ID *string `json:"id,omitempty"`
Data *schema.MetricStatistics `json:"data"`
}
type StringInput struct {
Eq *string `json:"eq,omitempty"`
Neq *string `json:"neq,omitempty"`
@@ -134,8 +250,9 @@ type StringInput struct {
}
type TimeRangeOutput struct {
From time.Time `json:"from"`
To time.Time `json:"to"`
Range *string `json:"range,omitempty"`
From time.Time `json:"from"`
To time.Time `json:"to"`
}
type TimeWeights struct {
@@ -153,20 +270,22 @@ type User struct {
type Aggregate string
const (
AggregateUser Aggregate = "USER"
AggregateProject Aggregate = "PROJECT"
AggregateCluster Aggregate = "CLUSTER"
AggregateUser Aggregate = "USER"
AggregateProject Aggregate = "PROJECT"
AggregateCluster Aggregate = "CLUSTER"
AggregateSubcluster Aggregate = "SUBCLUSTER"
)
var AllAggregate = []Aggregate{
AggregateUser,
AggregateProject,
AggregateCluster,
AggregateSubcluster,
}
func (e Aggregate) IsValid() bool {
switch e {
case AggregateUser, AggregateProject, AggregateCluster:
case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster:
return true
}
return false
@@ -176,7 +295,7 @@ func (e Aggregate) String() string {
return string(e)
}
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
func (e *Aggregate) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
@@ -193,11 +312,26 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *Aggregate) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e Aggregate) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}
type SortByAggregate string
const (
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
SortByAggregateTotalusers SortByAggregate = "TOTALUSERS"
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
@@ -209,6 +343,7 @@ const (
var AllSortByAggregate = []SortByAggregate{
SortByAggregateTotalwalltime,
SortByAggregateTotaljobs,
SortByAggregateTotalusers,
SortByAggregateTotalnodes,
SortByAggregateTotalnodehours,
SortByAggregateTotalcores,
@@ -219,7 +354,7 @@ var AllSortByAggregate = []SortByAggregate{
func (e SortByAggregate) IsValid() bool {
switch e {
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
return true
}
return false
@@ -229,7 +364,7 @@ func (e SortByAggregate) String() string {
return string(e)
}
func (e *SortByAggregate) UnmarshalGQL(v interface{}) error {
func (e *SortByAggregate) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
@@ -246,6 +381,20 @@ func (e SortByAggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *SortByAggregate) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e SortByAggregate) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}
type SortDirectionEnum string
const (
@@ -270,7 +419,7 @@ func (e SortDirectionEnum) String() string {
return string(e)
}
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
@@ -286,3 +435,17 @@ func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *SortDirectionEnum) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e SortDirectionEnum) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}

View File

@@ -1,15 +1,39 @@
package graph
import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/jmoiron/sqlx"
)
// This file will not be regenerated automatically.
//
// It serves as dependency injection for your app, add any dependencies you require here.
// Package-level singleton state for the resolver.
var (
// initOnce guards the initialisation closure run by Init.
initOnce sync.Once
// resolverInstance is assigned inside Init and handed out by GetResolverInstance.
resolverInstance *Resolver
)
// Resolver is the dependency container of this package: it carries a database
// handle and the job repository. Init fills both fields.
type Resolver struct {
DB *sqlx.DB
Repo *repository.JobRepository
}
func Init() {
initOnce.Do(func() {
db := repository.GetConnection()
resolverInstance = &Resolver{
DB: db.DB, Repo: repository.GetJobRepository(),
}
})
}
// GetResolverInstance returns the Resolver created by Init. When Init has not
// been called yet, so that no instance exists, the condition is reported
// through cclog.Fatal.
func GetResolverInstance() *Resolver {
	if resolverInstance == nil {
		// The previous message named the authentication module, which made
		// this failure misleading to diagnose.
		cclog.Fatal("Graph resolver not initialized!")
	}

	return resolverInstance
}

View File

@@ -1,23 +1,29 @@
package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.36
// Code generated by github.com/99designs/gqlgen version v0.17.84
import (
"context"
"errors"
"fmt"
"math"
"regexp"
"slices"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
// Partitions is the resolver for the partitions field.
@@ -25,26 +31,93 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
return r.Repo.Partitions(obj.Name)
}
// StartTime is the resolver for the startTime field. It converts the job's
// StartTime, interpreted as Unix seconds, into a time.Time and returns a
// pointer to it; the error is always nil.
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
	started := time.Unix(obj.StartTime, 0)
	return &started, nil
}
// Tags is the resolver for the tags field.
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
return r.Repo.GetTags(&obj.ID)
return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID)
}
// ConcurrentJobs is the resolver for the concurrentJobs field.
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
if obj.State == schema.JobStateRunning {
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
}
if obj.Exclusive != 1 && obj.Duration > 600 {
// FIXME: Make the hardcoded duration configurable
if obj.Shared != "none" && obj.Duration > 600 {
return r.Repo.FindConcurrentJobs(ctx, obj)
}
return nil, nil
}
// Footprint is the resolver for the footprint field.
//
// Every key of the footprint map fetched from the repository is cut at its
// last underscore: the part before becomes Name, the part after becomes Stat.
// A key without any underscore yields an empty Name and the whole key as
// Stat. The result is an empty, non-nil slice when the map has no entries.
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
	rawFootprint, err := r.Repo.FetchFootprint(obj)
	if err != nil {
		cclog.Warn("Error while fetching job footprint data")
		return nil, err
	}

	values := []*model.FootprintValue{}
	for key, val := range rawFootprint {
		metric, stat := "", key
		if cut := strings.LastIndex(key, "_"); cut >= 0 {
			metric, stat = key[:cut], key[cut+1:]
		}

		values = append(values, &model.FootprintValue{
			Name:  metric,
			Stat:  stat,
			Value: val,
		})
	}

	return values, nil
}
// EnergyFootprint is the resolver for the energyFootprint field.
//
// Each entry of the energy footprint map fetched from the repository is
// labelled with a hardware type derived from its metric name. The first
// matching pattern wins, in the order CPU, Accelerator, Memory, Core; names
// matching none of them are labelled "Other". The result is an empty,
// non-nil slice when the map has no entries.
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
	rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
	if err != nil {
		cclog.Warn("Error while fetching job energy footprint data")
		return nil, err
	}

	// Suboptimal: Nearly hardcoded metric name expectations.
	// The patterns do not depend on the entry, so they are compiled once
	// here rather than on every loop iteration.
	matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
	matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
	matchMem := regexp.MustCompile(`mem|Mem|MEM`)
	matchCore := regexp.MustCompile(`core|Core|CORE`)

	res := make([]*model.EnergyFootprintValue, 0, len(rawEnergyFootprint))
	for name, value := range rawEnergyFootprint {
		var hwType string
		switch {
		case matchCPU.MatchString(name):
			hwType = "CPU"
		case matchAcc.MatchString(name):
			hwType = "Accelerator"
		case matchMem.MatchString(name):
			hwType = "Memory"
		case matchCore.MatchString(name):
			hwType = "Core"
		default:
			hwType = "Other"
		}

		res = append(res, &model.EnergyFootprintValue{
			Hardware: hwType,
			Metric:   name,
			Value:    value,
		})
	}

	return res, nil
}
// MetaData is the resolver for the metaData field.
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (any, error) {
return r.Repo.FetchMetadata(obj)
}
@@ -53,41 +126,82 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use
return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User)
}
// Name is the resolver for the name field.
// It is not implemented: the body panics unconditionally.
func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) {
panic(fmt.Errorf("not implemented: Name - name"))
}
// CreateTag is the resolver for the createTag field.
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
id, err := r.Repo.CreateTag(typeArg, name)
if err != nil {
log.Warn("Error while creating tag")
return nil, err
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && scope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && scope == "global" ||
user.Username == scope {
// Create in DB
id, err := r.Repo.CreateTag(typeArg, name, scope)
if err != nil {
cclog.Warn("Error while creating tag")
return nil, err
}
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
} else {
cclog.Warnf("Not authorized to create tag with scope: %s", scope)
return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope)
}
}
// DeleteTag is the resolver for the deleteTag field.
// It is not implemented: the body panics unconditionally.
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
// This uses ID string <-> ID string, whereas RemoveTagFromList uses []string <-> []int.
panic(fmt.Errorf("not implemented: DeleteTag - deleteTag"))
}
// AddTagsToJob is the resolver for the addTagsToJob field.
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while adding tag to job")
cclog.Warn("Error while adding tag to job")
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
cclog.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
log.Warn("Error while adding tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
cclog.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Add to Job
if tags, err = r.Repo.AddTag(user, jid, tid); err != nil {
cclog.Warn("Error while adding tag")
return nil, err
}
} else {
cclog.Warnf("Not authorized to add tag: %d", tid)
return nil, fmt.Errorf("not authorized to add tag: %d", tid)
}
}
@@ -96,39 +210,127 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while parsing job id")
cclog.Warn("Error while parsing job id")
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
cclog.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
log.Warn("Error while removing tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
cclog.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Remove from Job
if tags, err = r.Repo.RemoveTag(user, jid, tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
}
} else {
cclog.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
}
}
return tags, nil
}
// RemoveTagFromList is the resolver for the removeTagFromList field.
//
// The tag IDs are processed in order. Each one is parsed as a base-10 int64,
// looked up, checked against the context user's permissions and then removed
// through the repository. The first failure of any kind aborts the loop and
// returns a nil slice together with the error; otherwise the IDs of all
// removed tags are returned.
func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) {
	// The permission check below needs the user stored in the context.
	user := repository.GetUserFromContext(ctx)
	if user == nil {
		return nil, fmt.Errorf("no user in context")
	}

	removed := []int{}
	for _, rawID := range tagIds {
		tid, err := strconv.ParseInt(rawID, 10, 64)
		if err != nil {
			cclog.Warn("Error while parsing tag id for removal")
			return nil, err
		}

		// Only the scope is needed from the tag's info.
		_, _, tscope, exists := r.Repo.TagInfo(tid)
		if !exists {
			cclog.Warnf("Tag does not exist (ID): %d", tid)
			return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
		}

		// Allowed: admins on "global"/"admin" tags, or anyone on a tag whose
		// scope equals their own username.
		adminOnSharedTag := user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin")
		if !adminOnSharedTag && user.Username != tscope {
			cclog.Warnf("Not authorized to remove tag: %d", tid)
			return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
		}

		if err = r.Repo.RemoveTagById(tid); err != nil {
			cclog.Warn("Error while removing tag")
			return nil, err
		}
		removed = append(removed, int(tid))
	}

	return removed, nil
}
// UpdateConfiguration is the resolver for the updateConfiguration field.
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
if err := repository.GetUserCfgRepo().UpdateConfig(name, value, repository.GetUserFromContext(ctx)); err != nil {
log.Warn("Error while updating user config")
cclog.Warn("Error while updating user config")
return nil, err
}
return nil, nil
}
// ID is the resolver for the id field.
// It is not implemented: the body panics unconditionally.
func (r *nodeResolver) ID(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: ID - id"))
}
// SchedulerState is the resolver for the schedulerState field. It returns the
// node's NodeState, or an error when that state is empty.
func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (schema.SchedulerState, error) {
	if obj.NodeState == "" {
		return "", fmt.Errorf("no SchedulerState (NodeState) on Object")
	}
	return obj.NodeState, nil
}
// HealthState is the resolver for the healthState field.
// It is not implemented: the body panics unconditionally.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: HealthState - healthState"))
}
// MetaData is the resolver for the metaData field.
//
// The field is not implemented yet. The resolver reports that as a regular
// error instead of panicking, so the failure travels through the normal
// resolver error path.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
	return nil, fmt.Errorf("not implemented: MetaData - metaData")
}
// Clusters is the resolver for the clusters field.
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) {
return archive.Clusters, nil
@@ -136,7 +338,20 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error)
// Tags is the resolver for the tags field.
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
return r.Repo.GetTags(nil)
return r.Repo.GetTags(repository.GetUserFromContext(ctx), nil)
}
// GlobalMetrics is the resolver for the globalMetrics field.
//
// A request whose context user holds the user or manager role receives
// archive.GlobalUserMetricList; every other request (including one without a
// user in the context) receives archive.GlobalMetricList.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
	requester := repository.GetUserFromContext(ctx)
	restricted := requester != nil &&
		(requester.HasRole(schema.RoleUser) || requester.HasRole(schema.RoleManager))
	if restricted {
		return archive.GlobalUserMetricList, nil
	}
	return archive.GlobalMetricList, nil
}
// User is the resolver for the user field.
@@ -148,7 +363,7 @@ func (r *queryResolver) User(ctx context.Context, username string) (*model.User,
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
data, err := r.Repo.AllocatedNodes(cluster)
if err != nil {
log.Warn("Error while fetching allocated nodes")
cclog.Warn("Error while fetching allocated nodes")
return nil, err
}
@@ -163,17 +378,82 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
return counts, nil
}
// Node is the resolver for the node field.
//
// id is the decimal string form of the node's numeric ID. A parse failure is
// logged and returned; otherwise the lookup result of the node repository is
// returned unchanged.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
	repo := repository.GetNodeRepository()
	numericID, err := strconv.ParseInt(id, 10, 64)
	if err != nil {
		// The value being parsed here is a node ID, not a job ID.
		cclog.Warn("Error while parsing node id")
		return nil, err
	}
	// NOTE(review): the meaning of the second argument (false) is defined by
	// GetNodeByID and is not visible here.
	return repo.GetNodeByID(numericID, false)
}
// Nodes is the resolver for the nodes field.
//
// It returns all nodes matching filter together with their count. On a
// repository error nothing but the error is returned, so callers never see a
// partially populated result next to a non-nil error.
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
	repo := repository.GetNodeRepository()
	// Paging is ignored (nil is passed in its place). NOTE(review): the
	// original comment also says "Order Unused" although order is forwarded;
	// confirm how QueryNodes treats it.
	nodes, err := repo.QueryNodes(ctx, filter, nil, order)
	if err != nil {
		cclog.Warn("Error while querying nodes")
		return nil, err
	}

	count := len(nodes)
	return &model.NodeStateResultList{Items: nodes, Count: &count}, nil
}
// NodeStates is the resolver for the nodeStates field.
//
// It counts the nodes matching filter once per "node_state" and once per
// "health_state" and returns both count lists concatenated, scheduler states
// first. The first failing count aborts the query.
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
	nodeRepo := repository.GetNodeRepository()

	bySchedulerState, err := nodeRepo.CountStates(ctx, filter, "node_state")
	if err != nil {
		cclog.Warnf("Error while counting nodeStates: %s", err.Error())
		return nil, err
	}

	byHealthState, err := nodeRepo.CountStates(ctx, filter, "health_state")
	if err != nil {
		cclog.Warnf("Error while counting healthStates: %s", err.Error())
		return nil, err
	}

	return append(bySchedulerState, byHealthState...), nil
}
// NodeStatesTimed is the resolver for the nodeStatesTimed field.
//
// typeArg selects the column that is counted over time: "node" counts by
// "node_state", "health" counts by "health_state". Any other value is
// rejected with an error.
func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.NodeFilter, typeArg string) ([]*model.NodeStatesTimed, error) {
	nodeRepo := repository.GetNodeRepository()

	switch typeArg {
	case "node":
		counts, err := nodeRepo.CountStatesTimed(ctx, filter, "node_state")
		if err != nil {
			cclog.Warnf("Error while counting nodeStates in time: %s", err.Error())
			return nil, err
		}
		return counts, nil
	case "health":
		counts, err := nodeRepo.CountStatesTimed(ctx, filter, "health_state")
		if err != nil {
			cclog.Warnf("Error while counting healthStates in time: %s", err.Error())
			return nil, err
		}
		return counts, nil
	default:
		return nil, errors.New("unknown Node State Query Type")
	}
}
// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
log.Warn("Error while parsing job id")
cclog.Warn("Error while parsing job id")
return nil, err
}
job, err := r.Repo.FindById(numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
log.Warn("Error while finding job by id")
cclog.Warn("Error while finding job by id")
return nil, err
}
@@ -187,16 +467,26 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
}
// JobMetrics is the resolver for the jobMetrics field.
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) {
if resolution == nil { // Load from Config
if config.Keys.EnableResampling != nil {
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
resolution = &defaultRes
} else { // Set 0 (Loads configured metric timestep)
defaultRes := 0
resolution = &defaultRes
}
}
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warn("Error while querying job for metrics")
cclog.Warn("Error while querying job for metrics")
return nil, err
}
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
data, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
log.Warn("Error while loading job data")
cclog.Warn("Error while loading job data")
return nil, err
}
@@ -214,9 +504,67 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return res, err
}
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
return r.jobsFootprints(ctx, filter, metrics)
// JobStats is the resolver for the jobStats field.
//
// It resolves the job identified by id, loads its statistics for the
// requested metrics and returns one NamedStats entry per metric. The order of
// the entries follows map iteration and is therefore not stable.
func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) {
	job, err := r.Query().Job(ctx, id)
	if err != nil {
		cclog.Warnf("Error while querying job %s for metadata", id)
		return nil, err
	}

	data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
	if err != nil {
		cclog.Warnf("Error while loading jobStats data for job id %s", id)
		return nil, err
	}

	res := make([]*model.NamedStats, 0, len(data))
	for name, md := range data {
		// Take the address of a per-entry copy: with pre-Go-1.22 loop
		// semantics &md would point every entry at the same variable.
		stats := md
		res = append(res, &model.NamedStats{
			Name: name,
			Data: &stats,
		})
	}
	return res, nil
}
// ScopedJobStats is the resolver for the scopedJobStats field.
//
// It resolves the job identified by id, loads its scoped statistics and
// flattens the metric -> scope -> stats structure into one
// NamedStatsWithScope entry per (metric, scope) pair.
func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.NamedStatsWithScope, error) {
	job, err := r.Query().Job(ctx, id)
	if err != nil {
		cclog.Warnf("Error while querying job %s for metadata", id)
		return nil, err
	}

	data, err := metricdispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
	if err != nil {
		cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
		return nil, err
	}

	result := make([]*model.NamedStatsWithScope, 0)
	for metricName, byScope := range data {
		for scope, entries := range byScope {
			// Convert the loaded entries of this scope into their model form.
			converted := make([]*model.ScopedStats, 0)
			for _, entry := range entries {
				item := &model.ScopedStats{
					Hostname: entry.Hostname,
					ID:       entry.Id,
					Data:     entry.Data,
				}
				converted = append(converted, item)
			}

			named := &model.NamedStatsWithScope{
				Name:  metricName,
				Scope: scope,
				Stats: converted,
			}
			result = append(result, named)
		}
	}

	return result, nil
}
// Jobs is the resolver for the jobs field.
@@ -230,25 +578,47 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
if err != nil {
log.Warn("Error while querying jobs")
cclog.Warn("Error while querying jobs")
return nil, err
}
count, err := r.Repo.CountJobs(ctx, filter)
if err != nil {
log.Warn("Error while counting jobs")
cclog.Warn("Error while counting jobs")
return nil, err
}
return &model.JobResultList{Items: jobs, Count: &count}, nil
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage := len(nextJobs) == 1
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
// JobsStatistics is the resolver for the jobsStatistics field.
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) ([]*model.JobsStatistics, error) {
var err error
var stats []*model.JobsStatistics
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
// Top Level Defaults
defaultDurationBins := "1h"
defaultMetricBins := 10
if requireField(ctx, "totalJobs") || requireField(ctx, "totalUsers") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
if groupBy == nil {
stats, err = r.Repo.JobsStats(ctx, filter)
@@ -281,8 +651,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
}
if requireField(ctx, "histDuration") || requireField(ctx, "histNumNodes") || requireField(ctx, "histNumCores") || requireField(ctx, "histNumAccs") {
if numDurationBins == nil {
numDurationBins = &defaultDurationBins
}
if groupBy == nil {
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0])
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0], numDurationBins)
if err != nil {
return nil, err
}
@@ -291,9 +666,81 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
}
}
if requireField(ctx, "histMetrics") {
if numMetricBins == nil {
numMetricBins = &defaultMetricBins
}
if groupBy == nil {
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0], numMetricBins)
if err != nil {
return nil, err
}
} else {
return nil, errors.New("metric histograms only implemented without groupBy argument")
}
}
return stats, nil
}
// JobsMetricStats is the resolver for the jobsMetricStats field.
//
// It queries all jobs matching filter (no paging, ordered by start time
// ascending) and loads the statistics of the requested metrics for each of
// them. Loading is best effort: a job whose statistics cannot be loaded is
// logged and left out of the result instead of failing the whole query.
func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.JobStats, error) {
	// No Paging, Fixed Order by StartTime ASC
	order := &model.OrderByInput{
		Field: "startTime",
		Type:  "col",
		Order: "ASC",
	}

	jobs, err := r.Repo.QueryJobs(ctx, filter, nil, order)
	if err != nil {
		cclog.Warn("Error while querying jobs for comparison")
		return nil, err
	}

	res := make([]*model.JobStats, 0, len(jobs))
	for _, job := range jobs {
		data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
		if err != nil {
			// Keep the cause in the log; the job itself is skipped.
			cclog.Warnf("Error while loading comparison jobStats data for job id %d: %v", job.JobID, err)
			continue
		}

		sres := make([]*model.NamedStats, 0, len(data))
		for name, md := range data {
			// Take the address of a per-entry copy: with pre-Go-1.22 loop
			// semantics &md would point every entry at the same variable.
			stats := md
			sres = append(sres, &model.NamedStats{
				Name: name,
				Data: &stats,
			})
		}

		numThreadsInt := int(job.NumHWThreads)
		numAccsInt := int(job.NumAcc)
		res = append(res, &model.JobStats{
			// NOTE(review): job.ID is dereferenced unchecked; this assumes
			// jobs returned by QueryJobs always carry a database ID.
			ID:              int(*job.ID),
			JobID:           strconv.Itoa(int(job.JobID)),
			StartTime:       int(job.StartTime),
			Duration:        int(job.Duration),
			Cluster:         job.Cluster,
			SubCluster:      job.SubCluster,
			NumNodes:        int(job.NumNodes),
			NumHWThreads:    &numThreadsInt,
			NumAccelerators: &numAccsInt,
			Stats:           sres,
		})
	}

	// Per-job load errors were handled above, so the query itself succeeded.
	return res, nil
}
// JobsFootprints is the resolver for the jobsFootprints field.
//
// It forwards ctx, filter and metrics unchanged to the unexported
// jobsFootprints method and returns its result.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
// NOTE: Legacy naming! This resolver serves the normalized histograms of the
// analysis view only - it is *not* related to the DB "footprint" column.
return r.jobsFootprints(ctx, filter, metrics)
}
// RooflineHeatmap is the resolver for the rooflineHeatmap field.
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
@@ -302,8 +749,8 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
// NodeMetrics is the resolver for the nodeMetrics field.
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasRole(schema.RoleAdmin) {
return nil, errors.New("you need to be an administrator for this query")
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
@@ -312,19 +759,26 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
}
}
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
log.Warn("Error while loading node data")
cclog.Warn("error while loading node data")
return nil, err
}
nodeRepo := repository.GetNodeRepository()
stateMap, _ := nodeRepo.MapNodes(cluster)
nodeMetrics := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
host := &model.NodeMetrics{
Host: hostname,
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
}
host.SubCluster, _ = archive.GetSubClusterByNode(cluster, hostname)
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
@@ -342,6 +796,152 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nodeMetrics, nil
}
// NodeMetricsList is the resolver for the nodeMetricsList field.
//
// It returns one page of nodes of the given cluster/subCluster (restricted by
// stateFilter and nodeFilter) together with their metric data between from
// and to. When resolution is nil, the largest configured resampling
// resolution is used, or 0 when resampling is not configured. When metrics is
// nil, all metrics configured for the cluster are loaded.
//
// The query is rejected for users that are neither administrator nor support
// staff; a request without a user in the context is let through.
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, stateFilter string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
	if resolution == nil { // Load from Config
		defaultRes := 0 // 0 loads the configured metric timestep
		// slices.Max panics on an empty slice, so an enabled resampling
		// config without resolutions falls back to 0 as well.
		if rs := config.Keys.EnableResampling; rs != nil && len(rs.Resolutions) > 0 {
			defaultRes = slices.Max(rs.Resolutions)
		}
		resolution = &defaultRes
	}

	user := repository.GetUserFromContext(ctx)
	if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
		return nil, errors.New("you need to be administrator or support staff for this query")
	}

	nodeRepo := repository.GetNodeRepository()
	nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
	if nerr != nil {
		// Wrap instead of discarding the cause so it stays visible.
		return nil, fmt.Errorf("could not retrieve node list required for resolving NodeMetricsList: %w", nerr)
	}

	if metrics == nil {
		for _, mc := range archive.GetCluster(cluster).MetricConfig {
			metrics = append(metrics, mc.Name)
		}
	}

	data, err := metricdispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
	if err != nil {
		cclog.Warn("error while loading node data (Resolver.NodeMetricsList)")
		return nil, err
	}

	nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
	for hostname, hostMetrics := range data {
		host := &model.NodeMetrics{
			Host:    hostname,
			State:   stateMap[hostname],
			Metrics: make([]*model.JobMetricWithName, 0, len(hostMetrics)*len(scopes)),
		}
		// A failed subcluster lookup is only logged; the host is still listed.
		host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
		if err != nil {
			cclog.Warnf("error in nodeMetrics resolver: %s", err)
		}

		for metric, scopedMetrics := range hostMetrics {
			for scope, scopedMetric := range scopedMetrics {
				host.Metrics = append(host.Metrics, &model.JobMetricWithName{
					Name:   metric,
					Scope:  scope,
					Metric: scopedMetric,
				})
			}
		}

		nodeMetricsList = append(nodeMetricsList, host)
	}

	return &model.NodesResultList{
		Items:       nodeMetricsList,
		TotalNodes:  &countNodes,
		HasNextPage: &hasNextPage,
	}, nil
}
// ClusterMetrics is the resolver for the clusterMetrics field.
//
// It loads node-scope data of the requested metrics (all configured metrics
// when metrics is nil) for every node of cluster between from and to and sums
// the values per metric and sample index across nodes. The sums are rounded
// to two decimals. Timestep and unit of a metric are taken from the first
// node that reports it.
//
// The query is rejected for users that are neither administrator nor support
// staff; a request without a user in the context is let through.
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
	user := repository.GetUserFromContext(ctx)
	if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
		return nil, errors.New("you need to be administrator or support staff for this query")
	}

	if metrics == nil {
		for _, mc := range archive.GetCluster(cluster).MetricConfig {
			metrics = append(metrics, mc.Name)
		}
	}

	// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
	scopes := []schema.MetricScope{"node"}
	data, err := metricdispatcher.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
	if err != nil {
		cclog.Warn("error while loading node data")
		return nil, err
	}

	clusterMetrics := model.ClusterMetrics{
		NodeCount: 0,
		Metrics:   make([]*model.ClusterMetricWithName, 0),
	}

	collectorTimestep := make(map[string]int)
	collectorUnit := make(map[string]schema.Unit)
	collectorData := make(map[string][]schema.Float)

	for _, nodeData := range data {
		clusterMetrics.NodeCount++
		for metric, scopedMetrics := range nodeData {
			sums, seen := collectorData[metric]
			if !seen {
				// First node reporting this metric: record timestep and unit
				// and take its samples as the initial sums.
				sums = make([]schema.Float, 0)
				for _, scopedMetric := range scopedMetrics {
					collectorTimestep[metric] = scopedMetric.Timestep
					collectorUnit[metric] = scopedMetric.Unit
					for _, ser := range scopedMetric.Series {
						sums = append(sums, ser.Data...)
					}
				}
				collectorData[metric] = sums
				continue
			}

			// Sum up values by index. For this purpose (cluster-wide sum of
			// node metrics) OK.
			for _, scopedMetric := range scopedMetrics {
				for _, ser := range scopedMetric.Series {
					for i, val := range ser.Data {
						if i < len(sums) {
							sums[i] += val
						} else {
							// This series is longer than what was collected so
							// far: extend the sums instead of indexing out of
							// range.
							sums = append(sums, val)
						}
					}
				}
			}
			collectorData[metric] = sums
		}
	}

	for metricName, sums := range collectorData {
		cu := collectorUnit[metricName]
		roundedData := make([]schema.Float, 0, len(sums))
		for _, val := range sums {
			roundedData = append(roundedData, schema.Float((math.Round(float64(val)*100.0) / 100.0)))
		}

		clusterMetrics.Metrics = append(clusterMetrics.Metrics, &model.ClusterMetricWithName{
			Name:     metricName,
			Unit:     &cu,
			Timestep: collectorTimestep[metricName],
			Data:     roundedData,
		})
	}

	return &clusterMetrics, nil
}
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)
@@ -357,17 +957,27 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver
// Job returns generated.JobResolver implementation.
// The returned jobResolver wraps the receiver r.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
// MetricValue returns generated.MetricValueResolver implementation.
// The returned metricValueResolver wraps the receiver r.
func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} }
// Mutation returns generated.MutationResolver implementation.
// The returned mutationResolver wraps the receiver r.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
// Node returns generated.NodeResolver implementation.
// The returned nodeResolver wraps the receiver r.
func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} }
// Query returns generated.QueryResolver implementation.
// The returned queryResolver wraps the receiver r.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// SubCluster returns generated.SubClusterResolver implementation.
// The returned subClusterResolver wraps the receiver r.
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }
type (
clusterResolver struct{ *Resolver }
jobResolver struct{ *Resolver }
metricValueResolver struct{ *Resolver }
mutationResolver struct{ *Resolver }
nodeResolver struct{ *Resolver }
queryResolver struct{ *Resolver }
subClusterResolver struct{ *Resolver }
)

View File

@@ -1,20 +1,21 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package graph
import (
"context"
"fmt"
"math"
"slices"
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -24,11 +25,11 @@ func (r *queryResolver) rooflineHeatmap(
ctx context.Context,
filter []*model.JobFilter,
rows int, cols int,
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
minX float64, minY float64, maxX float64, maxY float64,
) ([][]float64, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
log.Error("Error while querying jobs for roofline")
cclog.Error("Error while querying jobs for roofline")
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
@@ -47,15 +48,22 @@ func (r *queryResolver) rooflineHeatmap(
continue
}
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
// resolution := 0
// for _, mc := range metricConfigs {
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricdispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
@@ -63,7 +71,7 @@ func (r *queryResolver) rooflineHeatmap(
flops, ok1 := flops_["node"]
membw, ok2 := membw_["node"]
if !ok1 || !ok2 {
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
cclog.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
continue
// TODO/FIXME:
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
@@ -98,7 +106,7 @@ func (r *queryResolver) rooflineHeatmap(
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
log.Error("Error while querying jobs for footprint")
cclog.Error("Error while querying jobs for footprint")
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
@@ -120,8 +128,8 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
log.Error("Error while loading averages for footprint")
if err := metricdispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}
@@ -179,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
func requireField(ctx context.Context, name string) bool {
fields := graphql.CollectAllFields(ctx)
for _, f := range fields {
if f == name {
return true
}
}
return false
return slices.Contains(fields, name)
}

132
internal/importer/README.md Normal file
View File

@@ -0,0 +1,132 @@
# Importer Package
The `importer` package provides functionality for importing job data into the ClusterCockpit database from archived job files.
## Overview
This package supports two primary import workflows:
1. **Bulk Database Initialization** - Reinitialize the entire job database from archived jobs
2. **Individual Job Import** - Import specific jobs from metadata/data file pairs
Both workflows enrich job metadata by calculating performance footprints and energy consumption metrics before persisting to the database.
## Main Entry Points
### InitDB()
Reinitializes the job database from all archived jobs.
```go
if err := importer.InitDB(); err != nil {
log.Fatal(err)
}
```
This function:
- Flushes existing job, tag, and jobtag tables
- Iterates through all jobs in the configured archive
- Enriches each job with calculated metrics
- Inserts jobs into the database in batched transactions (100 jobs per batch)
- Continues on individual job failures, logging errors
**Use Case**: Initial database setup or complete database rebuild from archive.
### HandleImportFlag(flag string)
Imports jobs from specified file pairs.
```go
// Format: "<meta.json>:<data.json>[,<meta2.json>:<data2.json>,...]"
flag := "/path/to/meta.json:/path/to/data.json"
if err := importer.HandleImportFlag(flag); err != nil {
log.Fatal(err)
}
```
This function:
- Parses the comma-separated file pairs
- Validates metadata and job data against schemas (if validation enabled)
- Enriches each job with footprints and energy metrics
- Imports jobs into both the archive and database
- Fails fast on the first error
**Use Case**: Importing specific jobs from external sources or manual job additions.
## Job Enrichment
Both import workflows use `enrichJobMetadata()` to calculate:
### Performance Footprints
Performance footprints are calculated from metric averages based on the subcluster configuration:
```go
job.Footprint["mem_used_avg"] = 45.2 // GB
job.Footprint["cpu_load_avg"] = 0.87 // percentage
```
### Energy Metrics
Energy consumption is calculated from power metrics using the formula:
```
Energy (kWh) = (Power (W) × Duration (s) / 3600) / 1000
```
For each energy metric:
```go
job.EnergyFootprint["acc_power"] = 12.5 // kWh
job.Energy = 150.2 // Total energy in kWh
```
**Note**: Energy calculations for metrics with unit "energy" (Joules) are not yet implemented.
## Data Validation
### SanityChecks(job *schema.Job)
Validates job metadata before database insertion:
- Cluster exists in configuration
- Subcluster is valid (assigns if needed)
- Job state is valid
- Resources and user fields are populated
- Node counts and hardware thread counts are positive
- Resource count matches declared node count
## Normalization Utilities
The package includes utilities for normalizing metric values to appropriate SI prefixes:
### Normalize(avg float64, prefix string)
Adjusts values and SI prefixes for readability:
```go
factor, newPrefix := importer.Normalize(2048.0, "M")
// Converts 2048 MB → ~2.0 GB
// Returns: factor for conversion, "G"
```
This is useful for automatically scaling metrics (e.g., memory, storage) to human-readable units.
## Dependencies
- `github.com/ClusterCockpit/cc-backend/internal/repository` - Database operations
- `github.com/ClusterCockpit/cc-backend/pkg/archive` - Job archive access
- `github.com/ClusterCockpit/cc-lib/schema` - Job schema definitions
- `github.com/ClusterCockpit/cc-lib/ccLogger` - Logging
- `github.com/ClusterCockpit/cc-lib/ccUnits` - SI unit handling
## Error Handling
- **InitDB**: Continues processing on individual job failures, logs errors, returns summary
- **HandleImportFlag**: Fails fast on first error, returns immediately
- Both functions log detailed error context for debugging
## Performance
- **Transaction Batching**: InitDB processes jobs in batches of 100 for optimal database performance
- **Tag Caching**: Tag IDs are cached during import to minimize database queries
- **Progress Reporting**: InitDB prints progress updates during bulk operations

View File

@@ -1,5 +1,5 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
@@ -10,16 +10,30 @@ import (
"fmt"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
//
// The flag format is: "<path-to-meta.json>:<path-to-data.json>[,<path-to-meta2.json>:<path-to-data2.json>,...]"
//
// For each job pair, this function:
// 1. Reads and validates the metadata JSON file (schema.Job)
// 2. Reads and validates the job data JSON file (schema.JobData)
// 3. Enriches the job with calculated footprints and energy metrics
// 4. Validates the job using SanityChecks()
// 5. Imports the job into the archive
// 6. Inserts the job into the database with associated tags
//
// Schema validation is performed if config.Keys.Validate is true.
//
// Returns an error if file reading, validation, enrichment, or database operations fail.
// The function stops processing on the first error encountered.
func HandleImportFlag(flag string) error {
r := repository.GetJobRepository()
@@ -31,7 +45,7 @@ func HandleImportFlag(flag string) error {
raw, err := os.ReadFile(files[0])
if err != nil {
log.Warn("Error while reading metadata file for import")
cclog.Warn("Error while reading metadata file for import")
return err
}
@@ -42,15 +56,18 @@ func HandleImportFlag(flag string) error {
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
if err = dec.Decode(&jobMeta); err != nil {
log.Warn("Error while decoding raw json metadata for import")
job := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
if err = dec.Decode(&job); err != nil {
cclog.Warn("Error while decoding raw json metadata for import")
return err
}
raw, err = os.ReadFile(files[1])
if err != nil {
log.Warn("Error while reading jobdata file for import")
cclog.Warn("Error while reading jobdata file for import")
return err
}
@@ -63,72 +80,41 @@ func HandleImportFlag(flag string) error {
dec.DisallowUnknownFields()
jobData := schema.JobData{}
if err = dec.Decode(&jobData); err != nil {
log.Warn("Error while decoding raw json jobdata for import")
cclog.Warn("Error while decoding raw json jobdata for import")
return err
}
// checkJobData(&jobData)
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
// if err != nil {
// log.Warn("Error while finding job in jobRepository")
// return err
// }
//
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
// }
//
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
// TODO: Other metrics...
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
log.Warn("Error while marshaling job resources")
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
log.Warn("Error while marshaling job metadata")
if err = enrichJobMetadata(&job); err != nil {
cclog.Errorf("Error enriching job metadata: %v", err)
return err
}
if err = SanityChecks(&job.BaseJob); err != nil {
log.Warn("BaseJob SanityChecks failed")
if err = SanityChecks(&job); err != nil {
cclog.Warn("BaseJob SanityChecks failed")
return err
}
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
log.Error("Error while importing job")
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
cclog.Error("Error while importing job")
return err
}
id, err := r.InsertJob(&job)
if err != nil {
log.Warn("Error while job db insert")
cclog.Warn("Error while job db insert")
return err
}
for _, tag := range job.Tags {
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
log.Error("Error while adding or creating tag")
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Error("Error while adding or creating tag on import")
return err
}
}
log.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
cclog.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
}
return nil
}

View File

@@ -1,5 +1,5 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer_test
@@ -16,9 +16,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
// copyFile copies a file from source path to destination path.
// Used by tests to set up test fixtures.
func copyFile(s string, d string) error {
r, err := os.Open(s)
if err != nil {
@@ -34,21 +37,29 @@ func copyFile(s string, d string) error {
return nil
}
// setup initializes a test environment for importer tests.
//
// Creates a temporary directory with:
// - A test job archive with cluster configuration
// - A SQLite database initialized with schema
// - Configuration files loaded
//
// Returns a JobRepository instance for test assertions.
func setup(t *testing.T) *repository.JobRepository {
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2m"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
@@ -57,7 +68,6 @@ func setup(t *testing.T) *repository.JobRepository {
},
{
"name": "fritz",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 944 },
"duration": { "from": 0, "to": 86400 },
@@ -66,7 +76,6 @@ func setup(t *testing.T) *repository.JobRepository {
},
{
"name": "taurus",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 4000 },
"duration": { "from": 0, "to": 604800 },
@@ -75,18 +84,18 @@ func setup(t *testing.T) *repository.JobRepository {
}
]}`
log.Init("info", true)
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0777); err != nil {
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
if err := os.Mkdir(fritzArchive, 0777); err != nil {
if err := os.Mkdir(fritzArchive, 0o777); err != nil {
t.Fatal(err)
}
if err := copyFile(filepath.Join("testdata", "cluster-fritz.json"),
@@ -95,17 +104,29 @@ func setup(t *testing.T) *repository.JobRepository {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
config.Init(cfgFilePath)
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
t.Fatal("Cluster configuration must be present")
}
} else {
t.Fatal("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
@@ -116,6 +137,7 @@ func setup(t *testing.T) *repository.JobRepository {
return repository.GetJobRepository()
}
// Result represents the expected test result for job import verification.
type Result struct {
JobId int64
Cluster string
@@ -123,6 +145,8 @@ type Result struct {
Duration int32
}
// readResult reads the expected test result from a golden file.
// Golden files contain the expected job attributes after import.
func readResult(t *testing.T, testname string) Result {
var r Result
@@ -140,6 +164,13 @@ func readResult(t *testing.T, testname string) Result {
return r
}
// TestHandleImportFlag tests the HandleImportFlag function with various job import scenarios.
//
// The test uses golden files in testdata/ to verify that jobs are correctly:
// - Parsed from metadata and data JSON files
// - Enriched with footprints and energy metrics
// - Inserted into the database
// - Retrievable with correct attributes
func TestHandleImportFlag(t *testing.T) {
r := setup(t)
@@ -163,7 +194,7 @@ func TestHandleImportFlag(t *testing.T) {
}
result := readResult(t, testname)
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
if err != nil {
t.Fatal(err)
}

View File

@@ -1,40 +1,68 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package importer provides functionality for importing job data into the ClusterCockpit database.
//
// The package supports two primary use cases:
// 1. Bulk database initialization from archived jobs via InitDB()
// 2. Individual job import from file pairs via HandleImportFlag()
//
// Both operations enrich job metadata by calculating footprints and energy metrics
// before persisting to the database.
package importer
import (
"encoding/json"
"fmt"
"math"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
// SQL statements passed to the repository's TransactionAdd while rebuilding
// the database in InitDB.
const (
// addTagQuery inserts a new row into the tag table; the placeholders are
// bound to the tag name and the tag type, in that order.
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
// setTagQuery inserts a row into the jobtag table; the placeholders are
// bound to the job id and the tag id, in that order.
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
)
// InitDB reinitializes the job database from archived job data.
//
// This function performs the following operations:
// 1. Flushes existing job, tag, and jobtag tables
// 2. Iterates through all jobs in the archive
// 3. Enriches each job with calculated footprints and energy metrics
// 4. Inserts jobs and tags into the database in batched transactions
//
// Jobs are processed in batches of 100 for optimal performance. The function
// continues processing even if individual jobs fail, logging errors and
// returning a summary at the end.
//
// Returns an error if database initialization, transaction management, or
// critical operations fail. Individual job failures are logged but do not
// stop the overall import process.
func InitDB() error {
r := repository.GetJobRepository()
if err := r.Flush(); err != nil {
log.Errorf("repository initDB(): %v", err)
cclog.Errorf("repository initDB(): %v", err)
return err
}
starttime := time.Now()
log.Print("Building job table...")
cclog.Print("Building job table...")
t, err := r.TransactionInit()
if err != nil {
log.Warn("Error while initializing SQL transactions")
cclog.Warn("Error while initializing SQL transactions")
return err
}
tags := make(map[string]int64)
// Not using log.Print because we want the line to end with `\r` and
// Not using cclog.Print because we want the line to end with `\r` and
// this function is only ever called when a special command line flag
// is passed anyways.
fmt.Printf("%d jobs inserted...\r", 0)
@@ -46,92 +74,195 @@ func InitDB() error {
for jobContainer := range ar.Iter(false) {
jobMeta := jobContainer.Meta
if jobMeta == nil {
cclog.Warn("skipping job with nil metadata")
errorOccured++
continue
}
// Bundle 100 inserts into one transaction for better performance
if i%100 == 0 {
r.TransactionCommit(t)
if i > 0 {
if err := t.Commit(); err != nil {
cclog.Errorf("transaction commit error: %v", err)
return err
}
// Start a new transaction for the next batch
t, err = r.TransactionInit()
if err != nil {
cclog.Errorf("transaction init error: %v", err)
return err
}
}
fmt.Printf("%d jobs inserted...\r", i)
}
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
// TODO: Other metrics...
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
log.Errorf("repository initDB(): %v", err)
if err := enrichJobMetadata(jobMeta); err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
log.Errorf("repository initDB(): %v", err)
if err := SanityChecks(jobMeta); err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
if err := SanityChecks(&job.BaseJob); err != nil {
log.Errorf("repository initDB(): %v", err)
id, jobErr := r.TransactionAddNamed(t,
repository.NamedJobInsert, jobMeta)
if jobErr != nil {
cclog.Errorf("repository initDB(): %v", jobErr)
errorOccured++
continue
}
id, err := r.TransactionAdd(t, job)
if err != nil {
log.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
// Job successfully inserted, increment counter
i += 1
for _, tag := range job.Tags {
for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr]
tagID, ok := tags[tagstr]
if !ok {
tagId, err = r.TransactionAddTag(t, tag)
var err error
tagID, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
if err != nil {
log.Errorf("Error adding tag: %v", err)
cclog.Errorf("Error adding tag: %v", err)
errorOccured++
continue
}
tags[tagstr] = tagId
tags[tagstr] = tagID
}
r.TransactionSetTag(t, id, tagId)
}
if err == nil {
i += 1
r.TransactionAdd(t,
setTagQuery,
id, tagID)
}
}
if errorOccured > 0 {
log.Warnf("Error in import of %d jobs!", errorOccured)
cclog.Warnf("Error in import of %d jobs!", errorOccured)
}
r.TransactionEnd(t)
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
cclog.Infof("A total of %d jobs have been registered in %.3f seconds.", i, time.Since(starttime).Seconds())
return nil
}
// This function also sets the subcluster if necessary!
func SanityChecks(job *schema.BaseJob) error {
// enrichJobMetadata calculates and populates job footprints, energy metrics, and serialized fields.
//
// This function performs the following enrichment operations:
//  1. Looks up the job's subcluster and fills job.Footprint with one entry per
//     footprint metric of that subcluster, keyed "<metric>_<statType>". The
//     stat type is taken from the metric's configuration; "avg" is used when
//     the metric cannot be found in the metric configuration.
//  2. Computes the per-metric energy footprint and the total job energy in kWh.
//  3. Marshals footprint, energy footprint, resources, and metadata into JSON
//     for database storage.
//
// The function reads job.Cluster and job.SubCluster, so both must already be set.
// Energy calculations convert power metrics (Watts) to energy (kWh) using the formula:
//
//	Energy (kWh) = (Avg Power (W) * NumNodes * Duration (s) / 3600) / 1000
//
// Metrics configured with energy unit "energy" are not implemented yet and
// contribute 0.0. An energy metric missing from the metric configuration is
// logged and also contributes 0.0.
//
// Returns an error if subcluster retrieval or JSON marshaling fails.
func enrichJobMetadata(job *schema.Job) error {
	sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
	if err != nil {
		cclog.Errorf("cannot get subcluster: %s", err.Error())
		return err
	}

	job.Footprint = make(map[string]float64)

	for _, fp := range sc.Footprint {
		// Fall back to the average unless the metric configuration names a
		// footprint statistic for this metric. The index is only meaningful
		// when the lookup succeeded.
		statType := "avg"

		if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
			statType = sc.MetricConfig[i].Footprint
		}

		name := fmt.Sprintf("%s_%s", fp, statType)

		job.Footprint[name] = repository.LoadJobStat(job, fp, statType)
	}

	job.RawFootprint, err = json.Marshal(job.Footprint)
	if err != nil {
		cclog.Warn("Error while marshaling job footprint")
		return err
	}

	job.EnergyFootprint = make(map[string]float64)

	// Total job energy is accumulated across all energy footprint metrics.
	totalEnergy := 0.0
	for _, fp := range sc.EnergyFootprint {
		// Reset per metric so an unhandled or unknown metric contributes 0.0.
		metricEnergy := 0.0
		if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
			// Note: For DB data, calculate and save as kWh
			switch sc.MetricConfig[i].Energy {
			case "energy": // this metric has energy as unit (Joules)
				cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, fp, job.Cluster)
				// FIXME: Needs sum as stats type
			case "power": // this metric has power as unit (Watt)
				// Energy: Power (in Watts) * Time (in Seconds)
				// Unit: (W * (s / 3600)) / 1000 = kWh
				// Round 2 Digits: round(Energy * 100) / 100
				// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
				// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
				rawEnergy := ((repository.LoadJobStat(job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
				metricEnergy = math.Round(rawEnergy*100.0) / 100.0
			}
		} else {
			cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
		}

		job.EnergyFootprint[fp] = metricEnergy
		totalEnergy += metricEnergy
	}

	job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
	if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
		cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
		return err
	}

	job.RawResources, err = json.Marshal(job.Resources)
	if err != nil {
		cclog.Warn("Error while marshaling job resources")
		return err
	}

	job.RawMetaData, err = json.Marshal(job.MetaData)
	if err != nil {
		cclog.Warn("Error while marshaling job metadata")
		return err
	}

	return nil
}
// SanityChecks validates job metadata and ensures cluster/subcluster configuration is valid.
//
// This function performs the following validations:
// 1. Verifies the cluster exists in the archive configuration
// 2. Assigns and validates the subcluster (may modify job.SubCluster)
// 3. Validates job state is a recognized value
// 4. Ensures resources and user fields are populated
// 5. Validates node counts and hardware thread counts are positive
// 6. Verifies the number of resources matches the declared node count
//
// The function may modify the job's SubCluster field if it needs to be assigned.
//
// Returns an error if any validation check fails.
func SanityChecks(job *schema.Job) error {
if c := archive.GetCluster(job.Cluster); c == nil {
return fmt.Errorf("no such cluster: %v", job.Cluster)
}
if err := archive.AssignSubCluster(job); err != nil {
log.Warn("Error while assigning subcluster to job")
cclog.Warn("Error while assigning subcluster to job")
return err
}
if !job.State.Valid() {
@@ -150,18 +281,14 @@ func SanityChecks(job *schema.BaseJob) error {
return nil
}
// loadJobStat returns the footprint value of the given metric from the job's
// statistics: the maximum for "mem_used", the average for every other metric,
// and 0.0 when the job carries no statistics entry for the metric.
func loadJobStat(job *schema.JobMeta, metric string) float64 {
	entry, found := job.Statistics[metric]
	if !found {
		return 0.0
	}

	// Memory usage is reported by its peak rather than its mean.
	if metric == "mem_used" {
		return entry.Max
	}

	return entry.Avg
}
// checkJobData normalizes metric units in job data based on average values.
//
// NOTE: This function is currently unused and contains incomplete implementation.
// It was intended to normalize byte and file-related metrics to appropriate SI prefixes,
// but the normalization logic is commented out. Consider removing or completing this
// function based on project requirements.
//
// TODO: Either implement the metric normalization or remove this dead code.
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
// var newUnit schema.Unit

View File

@@ -1,5 +1,5 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
@@ -7,13 +7,27 @@ package importer
import (
"math"
ccunits "github.com/ClusterCockpit/cc-units"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value
// to a more readable range (typically between 1.0 and 1000.0).
//
// For values greater than 1000, the function scales down by factors of 1000 (returns negative exponent).
// For values less than 1.0, the function scales up by factors of 1000 (returns positive exponent).
//
// Returns:
// - factor: The multiplicative factor to apply (10^(count*scale))
// - exponent: The power of 10 representing the adjustment (multiple of 3 for SI prefixes)
func getNormalizationFactor(v float64) (float64, int) {
count := 0
scale := -3
// Prevent infinite loop for zero or negative values
if v <= 0.0 {
return 1.0, 0
}
if v > 1000.0 {
for v > 1000.0 {
v *= 1e-3
@@ -29,9 +43,22 @@ func getNormalizationFactor(v float64) (float64, int) {
return math.Pow10(count * scale), count * scale
}
// getExponent calculates the SI prefix exponent from a numeric prefix value.
//
// For example:
// - Input: 1000.0 (kilo) returns 3
// - Input: 1000000.0 (mega) returns 6
// - Input: 1000000000.0 (giga) returns 9
//
// Returns the exponent representing the power of 10 for the SI prefix.
func getExponent(p float64) int {
count := 0
// Prevent infinite loop for infinity or NaN values
if math.IsInf(p, 0) || math.IsNaN(p) || p <= 0.0 {
return 0
}
for p > 1.0 {
p = p / 1000.0
count++
@@ -40,12 +67,42 @@ func getExponent(p float64) int {
return count * 3
}
// newPrefixFromFactor derives the SI prefix that results from rescaling a
// value by a power of ten.
//
// The original prefix op is converted to its power-of-ten exponent via
// getExponent, the adjustment e is subtracted from that exponent, and ten
// raised to the result is returned as the new prefix. A negative e (value
// scaled down) therefore yields a larger prefix, a positive e a smaller one.
//
// Parameters:
//   - op: The SI prefix currently attached to the value
//   - e: The exponent adjustment to subtract
//
// Returns the adjusted SI prefix.
func newPrefixFromFactor(op ccunits.Prefix, e int) ccunits.Prefix {
	shifted := getExponent(float64(op)) - e

	return ccunits.Prefix(math.Pow10(shifted))
}
// Normalize adjusts a metric value and its SI unit prefix to a more readable range.
//
// This function is useful for automatically scaling metrics to appropriate units.
// For example, normalizing 2048 M results in ~2.0 G (scaling uses SI factors of 1000, not 1024).
//
// The function analyzes the average value and determines if a different SI prefix
// would make the number more human-readable (typically keeping values between 1 and 1000).
//
// Parameters:
// - avg: The metric value to normalize
// - p: The current SI prefix as a string (e.g., "K", "M", "G")
//
// Returns:
// - factor: The multiplicative factor to apply to convert the value
// - newPrefix: The new SI prefix string to use
//
// Example:
//
// factor, newPrefix := Normalize(2048.0, "M") // returns factor for MB->GB conversion, "G"
func Normalize(avg float64, p string) (float64, string) {
f, e := getNormalizationFactor(avg)

View File

@@ -1,5 +1,5 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
@@ -8,9 +8,11 @@ import (
"fmt"
"testing"
ccunits "github.com/ClusterCockpit/cc-units"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.
// Verifies that values in the billions are correctly scaled to the "G" (giga) prefix.
func TestNormalizeFactor(t *testing.T) {
// var us string
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
@@ -38,6 +40,8 @@ func TestNormalizeFactor(t *testing.T) {
}
}
// TestNormalizeKeep tests that values already in an appropriate range maintain their prefix.
// Verifies that when values don't require rescaling, the original "G" prefix is preserved.
func TestNormalizeKeep(t *testing.T) {
s := []float64{3.0, 24.0, 390.0, 391.0}

File diff suppressed because it is too large Load Diff

View File

@@ -1 +1 @@
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n 
WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n 
WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}

View File

@@ -1 +1 @@
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}

232
internal/memorystore/api.go Normal file
View File

@@ -0,0 +1,232 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"fmt"
"math"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
// Sentinel errors returned by FetchData when a request fails validation.
var (
// ErrInvalidTimeRange is returned when the request's 'from' is greater than its 'to'.
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
// ErrEmptyCluster is returned when 'for-all-nodes' is set but the cluster name is empty.
ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty")
)
// APIMetricData is the result of reading one metric for one selector.
type APIMetricData struct {
// Error is set by FetchData to the message of a failed read.
Error *string `json:"error,omitempty"`
// Data holds the values returned by the memory store read.
Data schema.FloatArray `json:"data,omitempty"`
// From, To and Resolution are the values reported by the memory store read.
From int64 `json:"from"`
To int64 `json:"to"`
Resolution int64 `json:"resolution"`
// Avg, Min and Max are filled in by AddStats and scaled by ScaleBy.
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
// APIQueryRequest is the input of FetchData.
type APIQueryRequest struct {
// Cluster is the first selector element of every query.
Cluster string `json:"cluster"`
// Queries are the explicit queries to run.
Queries []APIQuery `json:"queries"`
// ForAllNodes lists metric names; when non-nil, FetchData adds one query per
// metric for every child of Cluster in the memory store.
ForAllNodes []string `json:"for-all-nodes"`
// From and To bound the requested time range; From must not be greater than To.
From int64 `json:"from"`
To int64 `json:"to"`
// WithStats makes FetchData call AddStats on every successful read.
WithStats bool `json:"with-stats"`
// WithData is overwritten with true by FetchData.
WithData bool `json:"with-data"`
// WithPadding makes FetchData call PadDataWithNull on every successful read.
WithPadding bool `json:"with-padding"`
}
// APIQueryResponse is the output of FetchData.
type APIQueryResponse struct {
// Queries holds only the queries FetchData generated from ForAllNodes.
Queries []APIQuery `json:"queries,omitempty"`
// Results has one entry per executed query, each holding one APIMetricData per selector.
Results [][]APIMetricData `json:"results"`
}
// APIQuery describes a single metric query handled by FetchData.
type APIQuery struct {
// Type and SubType are prepended to each entry of TypeIds and SubTypeIds
// to form additional selector elements. A nil Type selects the host itself.
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
// Metric is the metric name passed to the memory store read.
Metric string `json:"metric"`
// Hostname is the second selector element, after the cluster.
Hostname string `json:"host"`
// Resolution is passed through to the memory store read.
Resolution int64 `json:"resolution"`
TypeIds []string `json:"type-ids,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
// ScaleFactor, when non-zero, is applied to the result via ScaleBy.
ScaleFactor schema.Float `json:"scale-by,omitempty"`
// Aggregate makes FetchData build one selector covering all ids instead of
// one selector per id.
Aggregate bool `json:"aggreg"`
}
// AddStats computes the average, minimum and maximum over all non-NaN values
// in data.Data and stores them in data.Avg, data.Min and data.Max.
// If there is no non-NaN value, all three are set to NaN.
//
// TODO: Optimize this, just like the stats endpoint!
func (data *APIMetricData) AddStats() {
	var (
		count int
		total float64
		lo    = math.MaxFloat64
		hi    = -math.MaxFloat64
	)

	for _, sample := range data.Data {
		if sample.IsNaN() {
			continue
		}
		v := float64(sample)
		count++
		total += v
		lo = math.Min(lo, v)
		hi = math.Max(hi, v)
	}

	if count == 0 {
		data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
		return
	}

	data.Avg = schema.Float(total / float64(count))
	data.Min = schema.Float(lo)
	data.Max = schema.Float(hi)
}
// ScaleBy multiplies the statistics and every value in data.Data by f.
// A factor of 0 or 1 leaves the data untouched.
func (data *APIMetricData) ScaleBy(f schema.Float) {
	switch f {
	case 0, 1:
		return
	}

	data.Avg, data.Min, data.Max = data.Avg*f, data.Min*f, data.Max*f
	for i := range data.Data {
		data.Data[i] *= f
	}
}
// PadDataWithNull prepends NaN values to data.Data when the data starts later
// than the requested `from`, one NaN per missing step of the metric's
// frequency. Unknown metrics are left untouched. Only the front is padded;
// `to` is not used.
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
	info, known := ms.Metrics[metric]
	if !known {
		return
	}

	haveStep := data.From / info.Frequency
	wantStep := from / info.Frequency
	if haveStep <= wantStep {
		return
	}

	missing := int(haveStep - wantStep)
	padded := make([]schema.Float, 0, missing+len(data.Data))
	for i := 0; i < missing; i++ {
		padded = append(padded, schema.NaN)
	}
	padded = append(padded, data.Data...)
	data.Data = padded
}
// FetchData runs all queries of req against the memory store and returns one
// result list per query.
//
// It returns ErrInvalidTimeRange if req.From is greater than req.To,
// ErrEmptyCluster if ForAllNodes is set without a cluster, and an error if the
// memory store has not been initialized. Read errors of individual selectors
// do not abort the request; they are reported in the Error field of the
// corresponding APIMetricData.
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
	if req.From > req.To {
		return nil, ErrInvalidTimeRange
	}
	if req.ForAllNodes != nil && req.Cluster == "" {
		return nil, ErrEmptyCluster
	}

	// The data is always returned, whatever the caller asked for.
	req.WithData = true

	ms := GetMemoryStore()
	if ms == nil {
		return nil, fmt.Errorf("memorystore not initialized")
	}

	response := APIQueryResponse{
		Results: make([][]APIMetricData, 0, len(req.Queries)),
	}

	// Expand "for all nodes" into one query per (node, metric) pair.
	if req.ForAllNodes != nil {
		for _, node := range ms.ListChildren([]string{req.Cluster}) {
			for _, metric := range req.ForAllNodes {
				generated := APIQuery{
					Metric:   metric,
					Hostname: node,
				}
				req.Queries = append(req.Queries, generated)
				response.Queries = append(response.Queries, generated)
			}
		}
	}

	// idElement builds the selector element for a list of ids: a plain string
	// for exactly one id, a group otherwise.
	idElement := func(prefix string, ids []string) util.SelectorElement {
		if len(ids) == 1 {
			return util.SelectorElement{String: prefix + ids[0]}
		}
		group := make([]string, len(ids))
		for i, id := range ids {
			group[i] = prefix + id
		}
		return util.SelectorElement{Group: group}
	}

	for _, query := range req.Queries {
		sels := make([]util.Selector, 0, 1)

		switch {
		case query.Aggregate || query.Type == nil:
			// A single selector that covers all requested ids at once.
			sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
			if query.Type != nil {
				sel = append(sel, idElement(*query.Type, query.TypeIds))
				if query.SubType != nil {
					sel = append(sel, idElement(*query.SubType, query.SubTypeIds))
				}
			}
			sels = append(sels, sel)

		case query.SubType != nil:
			// One selector per (type id, subtype id) combination.
			for _, typeID := range query.TypeIds {
				for _, subTypeID := range query.SubTypeIds {
					sels = append(sels, util.Selector{
						{String: req.Cluster},
						{String: query.Hostname},
						{String: *query.Type + typeID},
						{String: *query.SubType + subTypeID},
					})
				}
			}

		default:
			// One selector per type id.
			for _, typeID := range query.TypeIds {
				sels = append(sels, util.Selector{
					{String: req.Cluster},
					{String: query.Hostname},
					{String: *query.Type + typeID},
				})
			}
		}

		res := make([]APIMetricData, 0, len(sels))
		for _, sel := range sels {
			var (
				data APIMetricData
				err  error
			)
			data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
			if err != nil {
				msg := err.Error()
				data.Error = &msg
				res = append(res, data)
				continue
			}

			// Stats are computed on the raw values; ScaleBy scales them too.
			if req.WithStats {
				data.AddStats()
			}
			if query.ScaleFactor != 0 {
				data.ScaleBy(query.ScaleFactor)
			}
			if req.WithPadding {
				data.PadDataWithNull(ms, req.From, req.To, query.Metric)
			}
			if !req.WithData {
				data.Data = nil
			}
			res = append(res, data)
		}
		response.Results = append(response.Results, res)
	}

	return &response, nil
}

View File

@@ -0,0 +1,196 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"archive/zip"
"bufio"
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
// Archiving starts a goroutine that calls ArchiveCheckpoints once per
// configured archive interval until ctx is cancelled. Each run handles the
// checkpoints older than one interval. wg.Done is called when the goroutine
// ends; a non-positive interval ends it immediately.
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
	go func() {
		defer wg.Done()

		interval, err := time.ParseDuration(Keys.Archive.Interval)
		if err != nil {
			cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
		}
		if interval <= 0 {
			return
		}

		// runOnce performs a single archiving pass and logs its outcome.
		runOnce := func() {
			cutoff := time.Now().Add(-interval)
			cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", cutoff.Format(time.RFC3339))

			count, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
				Keys.Archive.RootDir, cutoff.Unix(), Keys.Archive.DeleteInstead)
			if err != nil {
				cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
				return
			}
			cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", count)
		}

		ticks := time.NewTicker(interval)
		defer ticks.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticks.C:
				runOnce()
			}
		}
	}()
}
// ErrNoNewArchiveData is returned by (*AvroLevel).toCheckpoint when there is
// nothing to write; (*AvroStore).ToCheckpoint skips it without counting an error.
var ErrNoNewArchiveData error = errors.New("all data already archived")
// ArchiveCheckpoints zips all checkpoint files older than `from` and writes
// them to `archiveDir`, deleting them from `checkpointsDir`. With
// deleteInstead set, the files are only deleted.
//
// checkpointsDir is expected to contain one directory per cluster, each
// holding one directory per host; every host directory is processed by one of
// Keys.NumWorkers worker goroutines. It returns the number of files handled,
// together with an error if a directory could not be read or a host failed.
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
	clusterEntries, err := os.ReadDir(checkpointsDir)
	if err != nil {
		return 0, err
	}

	type job struct {
		cdir, adir    string
		cluster, host string
	}

	var (
		workers  sync.WaitGroup
		archived int32
		failed   int32
	)
	jobs := make(chan job, Keys.NumWorkers)

	workers.Add(Keys.NumWorkers)
	for w := 0; w < Keys.NumWorkers; w++ {
		go func() {
			defer workers.Done()
			for j := range jobs {
				count, err := archiveCheckpoints(j.cdir, j.adir, from, deleteInstead)
				if err != nil {
					cclog.Errorf("error while archiving %s/%s: %s", j.cluster, j.host, err.Error())
					atomic.AddInt32(&failed, 1)
				}
				atomic.AddInt32(&archived, int32(count))
			}
		}()
	}

	for _, clusterEntry := range clusterEntries {
		cluster := clusterEntry.Name()
		hostEntries, readErr := os.ReadDir(filepath.Join(checkpointsDir, cluster))
		if readErr != nil {
			// Remember the failure but keep going with the other clusters.
			err = readErr
		}

		for _, hostEntry := range hostEntries {
			host := hostEntry.Name()
			jobs <- job{
				cdir:    filepath.Join(checkpointsDir, cluster, host),
				adir:    filepath.Join(archiveDir, cluster, host),
				cluster: cluster,
				host:    host,
			}
		}
	}

	close(jobs)
	workers.Wait()

	switch {
	case err != nil:
		return int(archived), err
	case failed > 0:
		return int(archived), fmt.Errorf("%d errors happened while archiving (%d successes)", failed, archived)
	}
	return int(archived), nil
}
// Helper function for `ArchiveCheckpoints`.
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return 0, err
}
extension := Keys.Checkpoints.FileFormat
files, err := findFiles(entries, from, extension, false)
if err != nil {
return 0, err
}
if deleteInstead {
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(archiveDir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
// Use closure to ensure file is closed immediately after use,
// avoiding file descriptor leak from defer in loop
err := func() error {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return err
}
defer r.Close()
w, err := zw.Create(checkpoint)
if err != nil {
return err
}
if _, err = io.Copy(w, r); err != nil {
return err
}
if err = os.Remove(filename); err != nil {
return err
}
return nil
}()
if err != nil {
return n, err
}
n += 1
}
return n, nil
}

View File

@@ -0,0 +1,477 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"os"
"path"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
// NumAvroWorkers is the number of worker goroutines started by (*AvroStore).ToCheckpoint.
var NumAvroWorkers int = DefaultAvroWorkers
// startUp is true until the first ToCheckpoint run that finishes without errors.
// While it is true, getTimestamp returns 0.
var startUp bool = true
// ToCheckpoint writes the buffered data of every level at depth three of the
// store (cluster, node, frequency) to Avro checkpoint files below dir, using
// NumAvroWorkers worker goroutines. dumpAll is passed through to the
// per-level toCheckpoint.
//
// It returns the number of levels checkpointed successfully. Levels without
// new data are skipped silently; any other failure is logged and makes the
// call return an error. The package-level startUp flag is cleared only after
// a run without errors.
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
	levels := make([]*AvroLevel, 0)
	selectors := make([][]string, 0)

	// Collect all frequency levels while holding the read locks of their parents.
	as.root.lock.RLock()
	// Cluster
	for sel1, l1 := range as.root.children {
		l1.lock.RLock()
		// Node
		for sel2, l2 := range l1.children {
			l2.lock.RLock()
			// Frequency
			for sel3, l3 := range l2.children {
				levels = append(levels, l3)
				selectors = append(selectors, []string{sel1, sel2, sel3})
			}
			l2.lock.RUnlock()
		}
		l1.lock.RUnlock()
	}
	as.root.lock.RUnlock()

	type workItem struct {
		level    *AvroLevel
		dir      string
		selector []string
	}

	n, errs := int32(0), int32(0)

	var wg sync.WaitGroup
	wg.Add(NumAvroWorkers)
	work := make(chan workItem, NumAvroWorkers*2)
	for range NumAvroWorkers {
		go func() {
			defer wg.Done()

			for item := range work {
				from := getTimestamp(item.dir)

				err := item.level.toCheckpoint(item.dir, from, dumpAll)
				switch {
				case err == nil:
					atomic.AddInt32(&n, 1)
				case errors.Is(err, ErrNoNewArchiveData):
					// Nothing to write for this level: neither success nor error.
				default:
					cclog.Errorf("error while checkpointing %#v: %s", item.selector, err.Error())
					atomic.AddInt32(&errs, 1)
				}
			}
		}()
	}

	for i, level := range levels {
		work <- workItem{
			level:    level,
			dir:      path.Join(dir, path.Join(selectors[i]...)),
			selector: selectors[i],
		}
	}

	close(work)
	wg.Wait()

	if errs > 0 {
		return int(n), fmt.Errorf("%d errors happened while creating avro checkpoints (%d successes)", errs, n)
	}

	startUp = false

	return int(n), nil
}
// getTimestamp returns the timestamp from the directory name
func getTimestamp(dir string) int64 {
// Extract the resolution and timestamp from the directory name
// The existing avro file will be in epoch timestamp format
// iterate over all the files in the directory and find the maximum timestamp
// and return it
resolution := path.Base(dir)
dir = path.Dir(dir)
files, err := os.ReadDir(dir)
if err != nil {
return 0
}
var maxTS int64 = 0
if len(files) == 0 {
return 0
}
for _, file := range files {
if file.IsDir() {
continue
}
name := file.Name()
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
continue
}
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
if err != nil {
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
continue
}
if ts > maxTS {
maxTS = ts
}
}
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
if startUp {
return 0
}
if updateTime < time.Now().Unix() {
return 0
}
return maxTS
}
// toCheckpoint appends the buffered samples of this level to the Avro file
// `<dir>_<from>.avro` and removes the written samples from l.data.
//
// The last element of dir is the resolution. Only timestamps older than
// CheckpointBufferMinutes-1 minutes are written, unless dumpAll is set, in
// which case everything up to now is written. If from is 0, the smallest
// timestamp holding data is used instead. If the fields of the new data are
// not covered by the schema of an existing file, the old records are read
// back, the file is removed and everything is rewritten with the merged
// schema.
//
// It returns ErrNoNewArchiveData if there is nothing to write.
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
	l.lock.Lock()
	defer l.lock.Unlock()

	// The last path element is the resolution. A value that does not parse
	// yields 0, which is guarded against below.
	intRes, _ := strconv.Atoi(path.Base(dir))

	// Find the smallest timestamp in l.data that actually holds samples.
	minTS := int64(1<<63 - 1)
	for ts, dat := range l.data {
		if ts < minTS && len(dat) != 0 {
			minTS = ts
		}
	}

	if from == 0 && minTS != int64(1<<63-1) {
		from = minTS
	}
	if from == 0 {
		return ErrNoNewArchiveData
	}

	var schema string
	var codec *goavro.Codec
	recordList := make([]map[string]any, 0)

	filePath := dir + fmt.Sprintf("_%d.avro", from)

	// hasRecords is true only for an existing, non-empty checkpoint file.
	hasRecords := false
	info, statErr := os.Stat(filePath)
	switch {
	case errors.Is(statErr, os.ErrNotExist):
		if err := os.MkdirAll(path.Dir(dir), 0o755); err != nil {
			return fmt.Errorf("failed to create directory: %v", err)
		}
	case statErr != nil:
		// Any other stat failure leaves info nil; report it instead of
		// dereferencing it.
		return fmt.Errorf("failed to stat avro file: %v", statErr)
	case info.Size() != 0:
		hasRecords = true

		// Pick up the schema of the existing file.
		f, err := os.Open(filePath)
		if err != nil {
			return fmt.Errorf("failed to open existing avro file: %v", err)
		}
		reader, err := goavro.NewOCFReader(bufio.NewReader(f))
		if err != nil {
			f.Close()
			return fmt.Errorf("failed to create OCF reader: %v", err)
		}
		codec = reader.Codec()
		schema = codec.Schema()
		f.Close()
	}

	// readExisting loads all records currently stored in the checkpoint file.
	readExisting := func() ([]map[string]any, error) {
		f, err := os.Open(filePath)
		if err != nil {
			return nil, fmt.Errorf("failed to open Avro file: %v", err)
		}
		defer f.Close()

		ocfReader, err := goavro.NewOCFReader(bufio.NewReader(f))
		if err != nil {
			return nil, fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
		}

		var records []map[string]any
		for ocfReader.Scan() {
			record, err := ocfReader.Read()
			if err != nil {
				return nil, fmt.Errorf("failed to read record: %v", err)
			}
			records = append(records, record.(map[string]any))
		}
		// Scan also stops on errors; the file is deleted afterwards, so a
		// partial read must not go unnoticed.
		if err := ocfReader.Err(); err != nil {
			return nil, fmt.Errorf("failed to read record: %v", err)
		}
		return records, nil
	}

	timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
	if dumpAll {
		timeRef = time.Now().Unix()
	}

	// Empty values
	if len(l.data) == 0 && intRes > 0 {
		// we checkpoint avro files every 60 seconds
		for i := 0; i < 60/intRes; i++ {
			recordList = append(recordList, make(map[string]any))
		}
	}

	// The old records only have to be read back once.
	readFlag := true

	for ts := range l.data {
		if ts >= timeRef {
			continue
		}
		data := l.data[ts]

		schemaGen, err := generateSchema(data)
		if err != nil {
			return err
		}

		var changed bool
		changed, schema, err = compareSchema(schema, schemaGen)
		if err != nil {
			return fmt.Errorf("failed to compare read and generated schema: %v", err)
		}

		if changed && readFlag && hasRecords {
			// The schema grew: rewrite the old records with the merged schema.
			old, err := readExisting()
			if err != nil {
				return err
			}
			recordList = append(recordList, old...)

			if err := os.Remove(filePath); err != nil {
				return fmt.Errorf("failed to delete file: %v", err)
			}
			readFlag = false
		}

		codec, err = goavro.NewCodec(schema)
		if err != nil {
			return fmt.Errorf("failed to create codec after merged schema: %v", err)
		}

		recordList = append(recordList, generateRecord(data))
		delete(l.data, ts)
	}

	if len(recordList) == 0 {
		return ErrNoNewArchiveData
	}

	f, err := os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
	if err != nil {
		return fmt.Errorf("failed to append new avro file: %v", err)
	}

	writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
		W:               f,
		Codec:           codec,
		CompressionName: goavro.CompressionDeflateLabel,
	})
	if err != nil {
		f.Close()
		return fmt.Errorf("failed to create OCF writer: %v", err)
	}

	// Append the new record
	if err := writer.Append(recordList); err != nil {
		f.Close()
		return fmt.Errorf("failed to append record: %v", err)
	}

	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to close avro file: %v", err)
	}

	return nil
}
// compareSchema decides which Avro schema to use for writing, given the schema
// read from an existing file (schemaRead) and the schema generated from new
// data (schemaGen).
//
// It returns (false, schemaGen) if there is no read schema,
// (false, schemaRead) if every generated field name is already part of the
// read schema, and (true, merged) otherwise, where merged is the JSON of a
// record schema with the union of both field sets, sorted by name. Fields
// present in both schemas take the generated definition.
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
	if schemaRead == "" {
		return false, schemaGen, nil
	}

	var generated, existing AvroSchema
	if err := json.Unmarshal([]byte(schemaGen), &generated); err != nil {
		return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
	}
	if err := json.Unmarshal([]byte(schemaRead), &existing); err != nil {
		return false, "", fmt.Errorf("failed to parse read schema: %v", err)
	}

	sortByName := func(fields []AvroField) {
		sort.Slice(fields, func(i, j int) bool {
			return fields[i].Name < fields[j].Name
		})
	}
	// namesMatch reports whether a's names equal the first len(a) names of b.
	namesMatch := func(a, b []AvroField) bool {
		for i := range a {
			if a[i].Name != b[i].Name {
				return false
			}
		}
		return true
	}

	sortByName(generated.Fields)
	sortByName(existing.Fields)

	// Fast path: the generated names are the leading names of the read schema.
	if len(generated.Fields) <= len(existing.Fields) && namesMatch(generated.Fields, existing.Fields) {
		return false, schemaRead, nil
	}

	// Union of both field sets; generated definitions win on name clashes.
	union := make(map[string]AvroField)
	for _, field := range existing.Fields {
		union[field.Name] = field
	}
	for _, field := range generated.Fields {
		union[field.Name] = field
	}

	var merged []AvroField
	for _, field := range union {
		merged = append(merged, field)
	}
	sortByName(merged)

	// Nothing new was added: keep using the read schema.
	if len(merged) == len(existing.Fields) && namesMatch(merged, existing.Fields) {
		return false, schemaRead, nil
	}

	mergedJSON, err := json.Marshal(AvroSchema{
		Type:   "record",
		Name:   generated.Name,
		Fields: merged,
	})
	if err != nil {
		return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
	}

	return true, string(mergedJSON), nil
}
// generateSchema builds the JSON of an Avro record schema named "DataRecord"
// with one "double" field (default -1.0) per key of data. Keys are escaped
// with correctKey. Should two keys escape to the same field name, the field
// is emitted only once.
func generateSchema(data map[string]schema.Float) (string, error) {
	fields := make([]map[string]any, 0, len(data))

	// Track the escaped names, since those are what end up in the schema.
	seen := make(map[string]struct{}, len(data))
	for key := range data {
		name := correctKey(key)
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}

		fields = append(fields, map[string]any{
			"name":    name,
			"type":    "double",
			"default": -1.0,
		})
	}

	schemaString, err := json.Marshal(map[string]any{
		"type":   "record",
		"name":   "DataRecord",
		"fields": fields,
	})
	if err != nil {
		return "", fmt.Errorf("failed to marshal schema: %v", err)
	}

	return string(schemaString), nil
}
// generateRecord converts one timestamp's samples into an Avro record: keys
// are escaped with correctKey and values are converted with Double(), since
// Avro only accepts basic types.
func generateRecord(data map[string]schema.Float) map[string]any {
	record := make(map[string]any)
	for name, sample := range data {
		record[correctKey(name)] = sample.Double()
	}
	return record
}
// correctKey escapes the characters '_', ':' and '.' in key as "_0x5F_",
// "_0x3A_" and "_0x2E_". The underscore is escaped first, so the underscores
// introduced by the later escapes are not escaped again.
func correctKey(key string) string {
	escapes := [...][2]string{
		{"_", "_0x5F_"},
		{":", "_0x3A_"},
		{".", "_0x2E_"},
	}
	for _, e := range escapes {
		key = strings.ReplaceAll(key, e[0], e[1])
	}
	return key
}
// avroKeyDecoder maps the escape tokens produced by correctKey back to the
// characters they stand for.
var avroKeyDecoder = strings.NewReplacer(
	"_0x2E_", ".",
	"_0x3A_", ":",
	"_0x5F_", "_",
)

// ReplaceKey reverses correctKey: the tokens "_0x2E_", "_0x3A_" and "_0x5F_"
// are turned back into '.', ':' and '_'.
//
// The key is decoded in a single left-to-right pass. Replacing one token kind
// after the other can match across token boundaries and corrupt keys that
// contain token-like text, e.g. the encoding of "a_0x2E_b".
func ReplaceKey(key string) string {
	return avroKeyDecoder.Replace(key)
}

View File

@@ -0,0 +1,84 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"context"
"slices"
"strconv"
"sync"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
// DataStaging starts a goroutine that receives samples from
// LineProtocolMessages and stores them in the Avro store until ctx is
// cancelled. wg.Done is called when the goroutine ends. If the configured
// checkpoint file format is "json", the goroutine ends immediately.
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
	go func() {
		defer wg.Done()

		// JSON checkpoints need no Avro staging.
		if Keys.Checkpoints.FileFormat == "json" {
			return
		}

		// The most recently used level is cached, together with its selector,
		// so consecutive samples for the same level skip the lookup.
		var avroLevel *AvroLevel
		oldSelector := make([]string, 0)

		for {
			select {
			case <-ctx.Done():
				return
			case val := <-LineProtocolMessages:
				// Fetch the frequency of the metric from the global configuration
				freq, err := GetMetricFrequency(val.MetricName)
				if err != nil {
					cclog.Errorf("Error fetching metric frequency: %s\n", err)
					continue
				}

				// The stored metric name is prefixed with the selector components.
				metricName := ""
				for _, selectorName := range val.Selector {
					metricName += selectorName + SelectorDelimiter
				}
				metricName += val.MetricName

				// The Avro level is addressed by cluster, node and frequency.
				selector := []string{val.Cluster, val.Node, strconv.FormatInt(freq, 10)}

				if !stringSlicesEqual(oldSelector, selector) {
					level := avroStore.root.findAvroLevelOrCreate(selector)
					if level == nil {
						cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
						// Drop the sample and keep the cache as it was, rather
						// than dereferencing (and caching) a nil level.
						continue
					}
					avroLevel = level
					oldSelector = slices.Clone(selector)
				}

				avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
			}
		}
	}()
}
// stringSlicesEqual reports whether a and b have the same length and hold the
// same strings in the same order.
func stringSlicesEqual(a, b []string) bool {
	return slices.Equal(a, b)
}

View File

@@ -0,0 +1,167 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
var (
// LineProtocolMessages is the unbuffered channel from which DataStaging receives the samples to stage.
LineProtocolMessages = make(chan *AvroStruct)
// SelectorDelimiter separates hierarchical selector components in metric names for Avro encoding
SelectorDelimiter = "_SEL_"
)
// CheckpointBufferMinutes is the size, in minutes, of the staging window: addMetric pre-creates
// timestamp keys for this many minutes.
var CheckpointBufferMinutes = DefaultCheckpointBufferMin
// AvroStruct is one sample sent over LineProtocolMessages.
type AvroStruct struct {
// MetricName is the plain metric name; DataStaging uses it to look up the metric's frequency.
MetricName string
// Cluster and Node select the Avro level the sample is stored in.
Cluster string
Node string
// Selector components are prepended to MetricName, each followed by SelectorDelimiter.
Selector []string
Value schema.Float
Timestamp int64
}
// AvroStore is a tree of AvroLevel nodes. ToCheckpoint walks it as
// cluster -> node -> frequency.
type AvroStore struct {
root AvroLevel
}
// avroStore is the package-level store returned by GetAvroStore.
var avroStore AvroStore
// AvroLevel is one node of the Avro store tree.
type AvroLevel struct {
// children maps the next selector element to its level; nil until the first child is created.
children map[string]*AvroLevel
// data maps a timestamp to the metric values staged for it, keyed by metric name.
data map[int64]map[string]schema.Float
// lock guards children and data.
lock sync.RWMutex
}
// AvroField is the JSON representation of one field of an Avro record schema.
type AvroField struct {
Name string `json:"name"`
Type any `json:"type"`
Default any `json:"default,omitempty"`
}
// AvroSchema is the JSON representation of an Avro schema with a list of fields.
type AvroSchema struct {
Type string `json:"type"`
Name string `json:"name"`
Fields []AvroField `json:"fields"`
}
// findAvroLevelOrCreate returns the level addressed by selector relative to
// l, creating every missing level on the way. An empty selector returns l.
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
	if len(selector) == 0 {
		return l
	}
	key, rest := selector[0], selector[1:]

	// Common case: the child exists, so a read lock is enough.
	// Looking up a key in a nil map is fine and simply reports "not found".
	l.lock.RLock()
	existing, found := l.children[key]
	l.lock.RUnlock()
	if found {
		return existing.findAvroLevelOrCreate(rest)
	}

	// The child is missing. Check again under the write lock, because another
	// goroutine may have created it in the meantime.
	l.lock.Lock()
	node, found := l.children[key]
	if !found {
		node = &AvroLevel{
			data:     make(map[int64]map[string]schema.Float, 0),
			children: nil,
		}
		if l.children == nil {
			l.children = map[string]*AvroLevel{key: node}
		} else {
			l.children[key] = node
		}
	}
	l.lock.Unlock()

	return node.findAvroLevelOrCreate(rest)
}
// addMetric stores value under metricName in the timestamp slot of l.data
// that is closest to timestamp.
//
// Slots are created ahead of time: an empty level gets
// CheckpointBufferMinutes*60/Freq slots starting at timestamp, spaced Freq
// apart; a level whose slot count differs from that number gets one slot
// appended after its newest one. Only slots within Freq of timestamp that do
// not yet hold metricName are candidates. If there is none, the value is
// dropped.
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
	l.lock.Lock()
	defer l.lock.Unlock()

	step := int64(Freq)
	wantSlots := int(CheckpointBufferMinutes * 60 / Freq)

	switch {
	case len(l.data) == wantSlots:
		// All slots are in place.
	case len(l.data) == 0:
		for i := 0; i < wantSlots; i++ {
			l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
		}
	default:
		// Append one slot after the newest existing one.
		var newest int64
		for ts := range l.data {
			if ts > newest {
				newest = ts
			}
		}
		l.data[newest+step] = make(map[string]schema.Float, 0)
	}

	var (
		bestTS   int64
		bestDiff = step + 1 // just outside the valid range
		matched  bool
	)
	for ts, metrics := range l.data {
		diff := timestamp - ts
		if diff < -step || diff > step {
			continue
		}
		// The metric is already present in this slot.
		if _, taken := metrics[metricName]; taken {
			continue
		}
		if d := Abs(diff); d < bestDiff {
			bestDiff = d
			bestTS = ts
			matched = true
		}
	}

	if matched {
		l.data[bestTS][metricName] = value
	}
}
// GetAvroStore returns a pointer to the package-level Avro store.
func GetAvroStore() *AvroStore {
return &avroStore
}
// Abs returns the absolute value of x.
func Abs(x int64) int64 {
	if x >= 0 {
		return x
	}
	return -x
}

View File

@@ -0,0 +1,190 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
// BufferCap is the default buffer capacity.
// buffer.data will only ever grow up to its capacity, and a new link
// in the buffer chain is created when needed, so that no copying
// of data or reallocation has to happen on writes.
const BufferCap int = DefaultBufferCapacity
// bufferPool recycles buffers. New buffers get a data slice of capacity BufferCap;
// newBuffer takes buffers from the pool and free puts them back.
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
// Errors for reads from the memory store.
var (
// ErrNoData signals that a metric/level holds no data.
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
// ErrDataDoesNotAlign signals that data from lower granularities does not align.
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
// prev and next link the buffers of one metric: prev is the older buffer, next the newer one.
prev *buffer
next *buffer
// data holds one value per `frequency` step, starting at `start`.
data []schema.Float
frequency int64
// start is set by newBuffer to the first timestamp minus half a frequency step.
start int64
// archived and closed are reset to false by newBuffer.
// NOTE(review): their meaning is defined outside this chunk — confirm.
archived bool
closed bool
}
// newBuffer takes a buffer from the pool and resets it for a chain whose
// first value has timestamp ts and whose values are freq apart. The buffer
// starts half a step before ts; the backing array of data is reused.
func newBuffer(ts, freq int64) *buffer {
	buf := bufferPool.Get().(*buffer)
	*buf = buffer{
		data:      buf.data[:0],
		frequency: freq,
		start:     ts - (freq / 2),
	}
	return buf
}
// write stores value at the slot that belongs to timestamp ts.
// If a new buffer had to be created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyway, it is allowed. Timestamps before the
// buffer's start are rejected with an error.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
	if ts < b.start {
		return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
	}

	slot := int((ts - b.start) / b.frequency)
	if slot >= cap(b.data) {
		// This buffer is full: chain a fresh one and write to its first slot.
		head := newBuffer(ts, b.frequency)
		head.prev, b.next = b, head
		b, slot = head, 0
	}

	switch {
	case slot < len(b.data):
		// Overwriting a value, or writing a value from the past.
		b.data[slot] = value
	default:
		// Fill up the unwritten slots with NaN, then append the value.
		for len(b.data) < slot {
			b.data = append(b.data, schema.NaN)
		}
		b.data = append(b.data, value)
	}
	return b, nil
}
// end returns the timestamp one step after the last value held by this
// buffer: the first write time plus one frequency step per stored value.
func (b *buffer) end() int64 {
	span := int64(len(b.data)) * b.frequency
	return b.firstWrite() + span
}
// firstWrite returns the buffer's start plus half a frequency step, which
// undoes the offset applied by newBuffer.
func (b *buffer) firstWrite() int64 {
	halfStep := b.frequency / 2
	return b.start + halfStep
}
// read returns all known values from `from` to `to`, one per frequency step.
// Gaps of information are represented as NaN.
// NOTE(review): an earlier version of this comment mentioned linear interpolation
// between neighboring cells; this function does not interpolate.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function goes back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added (`+=`) to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
// A request starting before this buffer's first write is handed to the
// previous buffer; without one, `from` is moved up to the first write.
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
// Past the capacity of this buffer: continue in the next one, if any.
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
// Slot not written yet: stop, unless a newer buffer starts before `to`.
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
// free walks back the chain and unlinks every older buffer whose end() lies
// before t. Unlinked buffers whose capacity equals BufferCap are handed back
// to bufferPool.
// delme is true if this buffer itself needs to be freed by the caller; n is
// the number of buffers found to be expired, including this one if delme is set.
func (b *buffer) free(t int64) (delme bool, n int) {
	if older := b.prev; older != nil {
		dropOlder, expired := older.free(t)
		n += expired
		if dropOlder {
			older.next = nil
			if cap(older.data) == BufferCap {
				bufferPool.Put(older)
			}
			b.prev = nil
		}
	}

	if b.end() < t {
		return true, n + 1
	}
	return false, n
}
// iterFromTo calls `callback` on every buffer of the chain that overlaps the
// range from `from` to `to`, oldest buffer first. Iteration stops at the first
// error returned by `callback`, which is passed on to the caller.
// Calling it on a nil buffer is valid and does nothing.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
	if b == nil {
		return nil
	}

	// Older buffers are visited before the current one.
	if err := b.prev.iterFromTo(from, to, callback); err != nil {
		return err
	}

	overlaps := from <= b.end() && b.start <= to
	if !overlaps {
		return nil
	}
	return callback(b)
}
// count returns the number of values stored in this buffer and in all older
// buffers reachable through prev.
func (b *buffer) count() int64 {
	total := int64(len(b.data))
	for older := b.prev; older != nil; older = older.prev {
		total += int64(len(older.data))
	}
	return total
}

View File

@@ -0,0 +1,761 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
const (
// CheckpointFilePerms is the permission mode used when checkpoint files are created.
CheckpointFilePerms = 0o644
// CheckpointDirPerms is the permission mode used when checkpoint directories are created.
CheckpointDirPerms = 0o755
// GCTriggerInterval is the number of host directories that must have been
// queued by FromCheckpoint before it starts forcing garbage collection runs.
GCTriggerInterval = DefaultGCTriggerInterval
)
// CheckpointMetrics is the serialized form of one metric in a JSON checkpoint
// file: the values in Data, the step between them in Frequency and the
// timestamp of the first value in Start.
// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
Data []schema.Float `json:"data"`
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
}
// CheckpointFile is the content of one JSON checkpoint file. It mirrors the
// level tree: Metrics holds the metrics of the level itself, Children the
// nested levels keyed by their selector, and From/To the time range that was
// requested when the checkpoint was written.
type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
From int64 `json:"from"`
To int64 `json:"to"`
}
// lastCheckpoint is the point in time up to which the JSON checkpointing loop
// has successfully written data; the next checkpoint starts from here.
var lastCheckpoint time.Time

// Checkpointing starts one background goroutine that periodically writes
// checkpoints until ctx is cancelled.
//
// With the file format "json" the memory store is checkpointed every
// Keys.Checkpoints.Interval (an unparsable interval is fatal, an interval <= 0
// disables checkpointing). With any other file format the Avro store is
// checkpointed once after CheckpointBufferMinutes and then every
// DefaultAvroCheckpointInterval.
//
// The goroutine calls wg.Done when it exits; no matching wg.Add is done here,
// so the caller presumably has to account for it beforehand.
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
	lastCheckpoint = time.Now()

	if Keys.Checkpoints.FileFormat != "json" {
		go func() {
			defer wg.Done()

			// First Avro checkpoint after the initial buffering period.
			initialDelay := time.Duration(CheckpointBufferMinutes) * time.Minute
			select {
			case <-ctx.Done():
				return
			case <-time.After(initialDelay):
				GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
			}

			ticker := time.NewTicker(DefaultAvroCheckpointInterval)
			defer ticker.Stop()
			for {
				select {
				case <-ctx.Done():
					return
				case <-ticker.C:
					GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
				}
			}
		}()
		return
	}

	ms := GetMemoryStore()
	go func() {
		defer wg.Done()

		interval, err := time.ParseDuration(Keys.Checkpoints.Interval)
		if err != nil {
			cclog.Fatal(err)
		}
		if interval <= 0 {
			return
		}

		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
				now := time.Now()
				n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
					lastCheckpoint.Unix(), now.Unix())
				if err != nil {
					// Keep lastCheckpoint so that the next run covers this range again.
					cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
					continue
				}
				cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
				lastCheckpoint = now
			}
		}
	}()
}
// MarshalJSON encodes the metric by hand into
// {"frequency":…,"start":…,"data":[…]}.
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
// NaN values are written as null, all other values with one decimal digit.
// NOTE(review): infinite values are not special-cased and would be written
// as +Inf/-Inf, which is not valid JSON.
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
	out := make([]byte, 0, 128+len(cm.Data)*8)

	out = append(out, `{"frequency":`...)
	out = strconv.AppendInt(out, cm.Frequency, 10)
	out = append(out, `,"start":`...)
	out = strconv.AppendInt(out, cm.Start, 10)
	out = append(out, `,"data":[`...)

	for idx := range cm.Data {
		if idx > 0 {
			out = append(out, ',')
		}
		val := cm.Data[idx]
		if val.IsNaN() {
			out = append(out, `null`...)
			continue
		}
		out = strconv.AppendFloat(out, float64(val), 'f', 1, 32)
	}

	out = append(out, `]}`...)
	return out, nil
}
// ToCheckpoint writes one JSON checkpoint file per host (second tree level)
// for the time range from `from` to `to` below dir/<cluster>/<host> and
// returns the number of files created.
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only a host at a time is locked, so this function can run
// in parallel to writes/reads.
// Hosts without new data are skipped silently; all other failures are logged
// and reported through a single summary error.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
	type workItem struct {
		level    *Level
		dir      string
		selector []string
	}

	// Collect the host levels first so that the upper locks are only held briefly.
	items := make([]workItem, 0)
	m.root.lock.RLock()
	for cluster, clusterLevel := range m.root.children {
		clusterLevel.lock.RLock()
		for host, hostLevel := range clusterLevel.children {
			selector := []string{cluster, host}
			items = append(items, workItem{
				level:    hostLevel,
				dir:      path.Join(dir, path.Join(selector...)),
				selector: selector,
			})
		}
		clusterLevel.lock.RUnlock()
	}
	m.root.lock.RUnlock()

	var created, failed int32
	var wg sync.WaitGroup
	queue := make(chan workItem, Keys.NumWorkers*2)

	wg.Add(Keys.NumWorkers)
	for w := 0; w < Keys.NumWorkers; w++ {
		go func() {
			defer wg.Done()
			for item := range queue {
				err := item.level.toCheckpoint(item.dir, from, to, m)
				switch {
				case err == nil:
					atomic.AddInt32(&created, 1)
				case err == ErrNoNewArchiveData:
					// Nothing new for this host: neither a success nor a failure.
				default:
					cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", item.selector, err.Error())
					atomic.AddInt32(&failed, 1)
				}
			}
		}()
	}

	for _, item := range items {
		queue <- item
	}
	close(queue)
	wg.Wait()

	if failed > 0 {
		return int(created), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", failed, created)
	}
	return int(created), nil
}
// toCheckpointFile builds the checkpoint representation of this level and all
// of its children for the range from `from` to `to`, holding the read lock of
// the level while doing so.
// Metrics without a buffer, and metrics whose buffers in the range are all
// marked as archived, are left out. It returns nil (and no error) when neither
// the level nor any child contributes data.
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	cf := &CheckpointFile{
		From:     from,
		To:       to,
		Metrics:  make(map[string]*CheckpointMetrics),
		Children: make(map[string]*CheckpointFile),
	}

	for name, cfg := range m.Metrics {
		head := l.metrics[cfg.offset]
		if head == nil {
			continue
		}

		// Only metrics with at least one not yet archived buffer in range are written.
		pending := false
		head.iterFromTo(from, to, func(b *buffer) error {
			pending = pending || !b.archived
			return nil
		})
		if !pending {
			continue
		}

		values := make([]schema.Float, (to-from)/head.frequency+1)
		values, start, end, err := head.read(from, to, values)
		if err != nil {
			return nil, err
		}
		// Mark everything past the range that was actually read as missing.
		for i := int((end - start) / head.frequency); i < len(values); i++ {
			values[i] = schema.NaN
		}

		cf.Metrics[name] = &CheckpointMetrics{
			Frequency: head.frequency,
			Start:     start,
			Data:      values,
		}
	}

	for name, child := range l.children {
		sub, err := child.toCheckpointFile(from, to, m)
		if err != nil {
			return nil, err
		}
		if sub != nil {
			cf.Children[name] = sub
		}
	}

	if len(cf.Children) == 0 && len(cf.Metrics) == 0 {
		return nil, nil
	}
	return cf, nil
}
// toCheckpoint writes the data of this level for the range from `from` to `to`
// as JSON to <dir>/<from>.json, creating dir if it does not exist yet.
// It returns ErrNoNewArchiveData when there is nothing to write.
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
	cf, err := l.toCheckpointFile(from, to, m)
	if err != nil {
		return err
	}
	if cf == nil {
		return ErrNoNewArchiveData
	}

	// O_TRUNC matters when a checkpoint for the same `from` already exists (for
	// example after a partially failed run): without it, a shorter new document
	// would leave stale bytes of the old one at the end of the file.
	const openFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
	target := path.Join(dir, fmt.Sprintf("%d.json", from))

	f, err := os.OpenFile(target, openFlags, CheckpointFilePerms)
	if err != nil && os.IsNotExist(err) {
		// The host directory is created lazily on the first checkpoint.
		if err = os.MkdirAll(dir, CheckpointDirPerms); err == nil {
			f, err = os.OpenFile(target, openFlags, CheckpointFilePerms)
		}
	}
	if err != nil {
		return err
	}

	bw := bufio.NewWriter(f)
	if err = json.NewEncoder(bw).Encode(cf); err != nil {
		f.Close()
		return err
	}
	if err = bw.Flush(); err != nil {
		f.Close()
		return err
	}
	// A failing Close on a freshly written file can mean lost data, so report it.
	return f.Close()
}
// FromCheckpoint loads the checkpoint files with the given extension ("json"
// or "avro") found below dir/<cluster>/<host> into the store and returns the
// number of files loaded. `from` is passed on to the per-host loader, which
// uses it to select the files to read.
// Hosts are distributed over Keys.NumWorkers goroutines. A problem with the
// directory layout aborts the scan and is returned as is; failures of single
// hosts are logged and reported through a summary error.
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
	var wg sync.WaitGroup
	work := make(chan [2]string, Keys.NumWorkers)
	n, errs := int32(0), int32(0)

	wg.Add(Keys.NumWorkers)
	for worker := 0; worker < Keys.NumWorkers; worker++ {
		go func() {
			defer wg.Done()
			for host := range work {
				lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
				nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
				if err != nil {
					cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
					atomic.AddInt32(&errs, 1)
				}
				// Files loaded before the failure still count.
				atomic.AddInt32(&n, int32(nn))
			}
		}()
	}

	i := 0
	clustersDir, err := os.ReadDir(dir)
	for _, clusterDir := range clustersDir {
		if !clusterDir.IsDir() {
			err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
			goto done
		}

		hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
		if e != nil {
			err = e
			goto done
		}

		for _, hostDir := range hostsDir {
			if !hostDir.IsDir() {
				err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
				goto done
			}

			i++
			if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
				// Forcing garbage collection runs here regularly during the loading of checkpoints
				// will decrease the total heap size after loading everything back to memory is done.
				// While loading data, the heap will grow fast, so the GC target size will double
				// almost always. By forcing GCs here, we can keep it growing more slowly so that
				// at the end, less memory is wasted.
				runtime.GC()
			}

			work <- [2]string{clusterDir.Name(), hostDir.Name()}
		}
	}

done:
	// Always drain the workers, also when the scan was aborted.
	close(work)
	wg.Wait()

	if err != nil {
		return int(n), err
	}
	if errs > 0 {
		return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while loading checkpoints (%d files loaded)", errs, n)
	}
	return int(n), nil
}
// FromCheckpointFiles restores the store from the checkpoint files below dir
// and returns the number of files loaded. A missing dir is created (failing to
// do so is fatal). Files in the configured format (default "avro") are
// preferred; if none exist, files in the other format are loaded instead.
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
	if _, statErr := os.Stat(dir); os.IsNotExist(statErr) {
		// The directory does not exist, so create it with the checkpoint directory permissions.
		if mkErr := os.MkdirAll(dir, CheckpointDirPerms); mkErr != nil {
			cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", mkErr)
		}
		cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
	}

	preferred := Keys.Checkpoints.FileFormat
	if preferred == "" {
		preferred = "avro"
	}

	// The fallback is the respective other known format (empty for unknown formats).
	var fallback string
	switch preferred {
	case "json":
		fallback = "avro"
	case "avro":
		fallback = "json"
	}

	// First, attempt to load the configured format.
	found, err := checkFilesWithExtension(dir, preferred)
	if err != nil {
		return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
	}
	if found {
		cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", preferred, preferred)
		return m.FromCheckpoint(dir, from, preferred)
	}

	// Nothing found, try the opposite format.
	found, err = checkFilesWithExtension(dir, fallback)
	if err != nil {
		return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
	}
	if found {
		cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", fallback, preferred)
		return m.FromCheckpoint(dir, from, fallback)
	}

	cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
	return 0, nil
}
// checkFilesWithExtension reports whether at least one regular entry with the
// file extension `extension` (given without the leading dot) exists anywhere
// below dir. The walk stops at the first match instead of visiting the whole
// checkpoint tree. Errors while accessing a path are returned.
func checkFilesWithExtension(dir string, extension string) (bool, error) {
	found := false
	wanted := "." + extension

	err := filepath.WalkDir(dir, func(p string, d fs.DirEntry, err error) error {
		if err != nil {
			return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", p, err)
		}
		if !d.IsDir() && filepath.Ext(d.Name()) == wanted {
			found = true
			// One hit answers the question; SkipAll ends the walk without an error.
			return filepath.SkipAll
		}
		return nil
	})
	if err != nil {
		return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
	}
	return found, nil
}
// loadAvroFile reads one Avro OCF checkpoint file and attaches its columns as
// archived buffers to this level or, for column names that contain
// SelectorDelimiter, to the matching (possibly newly created) child level.
// The file name must have the form <resolution>_<timestamp>.avro; both numbers
// are taken from the name, not from the file content.
// Files whose data ends before `from` are skipped without an error.
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
	br := bufio.NewReader(f)

	fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
	resolutionPart, timestampPart, ok := strings.Cut(fileName, "_")
	if !ok {
		return fmt.Errorf("[METRICSTORE]> error while reading avro file (unexpected file name %q)", fileName)
	}

	resolution, err := strconv.ParseInt(resolutionPart, 10, 64)
	if err != nil {
		return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
	}
	if resolution <= 0 {
		// A non-positive resolution would end in a division by zero further down.
		return fmt.Errorf("[METRICSTORE]> error while reading avro file (invalid resolution %d)", resolution)
	}

	fromTimestamp, err := strconv.ParseInt(strings.TrimSuffix(timestampPart, ".avro"), 10, 64)
	if err != nil {
		return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
	}
	// Same logic according to lineprotocol
	fromTimestamp -= (resolution / 2)

	var recordCounter int64 = 0

	// Create a new OCF reader from the buffered reader
	ocfReader, err := goavro.NewOCFReader(br)
	if err != nil {
		return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
	}

	metricsData := make(map[string]schema.FloatArray)

	for ocfReader.Scan() {
		datum, err := ocfReader.Read()
		if err != nil {
			return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
		}

		record, ok := datum.(map[string]any)
		if !ok {
			return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
		}

		for key, value := range record {
			number, isFloat := value.(float64)
			if !isFloat {
				return fmt.Errorf("[METRICSTORE]> error while reading avro file : value of %q has type %T, expected float64", key, value)
			}
			metricsData[key] = append(metricsData[key], schema.ConvertToFloat(number))
		}

		recordCounter += 1
	}

	// End of the data in this file, rounded down to whole minutes for
	// resolutions of up to 60. Coarser resolutions have less than one record
	// per minute; their end is computed directly from the record count.
	var to int64
	if perMinute := 60 / resolution; perMinute > 0 {
		to = fromTimestamp + recordCounter/perMinute*60
	} else {
		to = fromTimestamp + recordCounter*resolution
	}
	if to < from {
		return nil
	}

	for key, floatArray := range metricsData {
		metricName := ReplaceKey(key)

		if strings.Contains(metricName, SelectorDelimiter) {
			subString := strings.Split(metricName, SelectorDelimiter)

			// Walk (and create where needed) the child levels named by all but the last part.
			lvl := l
			for i := 0; i < len(subString)-1; i++ {
				sel := subString[i]

				if lvl.children == nil {
					lvl.children = make(map[string]*Level)
				}

				child, ok := lvl.children[sel]
				if !ok {
					child = &Level{
						metrics:  make([]*buffer, len(m.Metrics)),
						children: nil,
					}
					lvl.children[sel] = child
				}
				lvl = child
			}

			leafMetricName := subString[len(subString)-1]
			err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
			if err != nil {
				return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
			}
		} else {
			err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
			if err != nil {
				return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
			}
		}
	}

	return nil
}
// createBuffer wraps floatArray in an archived buffer starting at `from` with
// step `resolution` and makes it the newest buffer of metricName on this level.
// Metrics that are not configured in m are ignored. If the metric already has
// a buffer, the new one must not start earlier (otherwise an error is
// returned); the older buffer is padded with NaN up to the start of the new one.
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
	minfo, known := m.Metrics[metricName]
	if !known {
		return nil
	}

	size := len(floatArray)
	fresh := &buffer{
		frequency: resolution,
		start:     from,
		// Capacity is capped at the length so that later appends cannot grow into floatArray.
		data:     floatArray[0:size:size],
		archived: true,
	}

	if head := l.metrics[minfo.offset]; head != nil {
		if head.start > fresh.start {
			return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", fresh.start, head.start)
		}

		fresh.prev = head
		head.next = fresh

		// Time between the end of the older buffer and the start of the new one.
		gap := (int(fresh.start) - int(head.start)) - len(head.data)*int(fresh.frequency)
		if gap > 0 {
			for range gap / int(fresh.frequency) {
				head.data = append(head.data, schema.NaN)
			}
			head.data = head.data[0:len(head.data):len(head.data)]
		}
	}

	l.metrics[minfo.offset] = fresh
	return nil
}
// loadJSONFile decodes one JSON checkpoint file and loads its content into
// this level. Files that state an end (`to`) before `from` are skipped; files
// without an end (`to` == 0) are always loaded.
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
	var cf CheckpointFile
	if err := json.NewDecoder(bufio.NewReader(f)).Decode(&cf); err != nil {
		return err
	}

	if cf.To != 0 && cf.To < from {
		return nil
	}

	return l.loadFile(&cf, m)
}
// loadFile attaches the metrics of cf as archived buffers to this level and
// recursively does the same for all children of cf, creating child levels
// that do not exist yet. Metrics not configured in m are ignored.
// An error is returned if a metric already has a buffer that starts later
// than the one being loaded.
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
	for name, metric := range cf.Metrics {
		size := len(metric.Data)
		fresh := &buffer{
			frequency: metric.Frequency,
			start:     metric.Start,
			// Capacity is capped at the length of the decoded data.
			data:     metric.Data[0:size:size],
			archived: true,
		}

		minfo, known := m.Metrics[name]
		if !known {
			continue
		}

		if head := l.metrics[minfo.offset]; head != nil {
			if head.start > fresh.start {
				return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", fresh.start, head.start)
			}
			fresh.prev = head
			head.next = fresh
		}
		l.metrics[minfo.offset] = fresh
	}

	if l.children == nil && len(cf.Children) > 0 {
		l.children = make(map[string]*Level)
	}

	for sel, childCf := range cf.Children {
		child, exists := l.children[sel]
		if !exists {
			child = &Level{
				metrics:  make([]*buffer, len(m.Metrics)),
				children: nil,
			}
			l.children[sel] = child
		}

		if err := child.loadFile(childCf, m); err != nil {
			return err
		}
	}

	return nil
}
// fromCheckpoint loads the checkpoint files with the given extension from dir
// into this level and returns the number of files loaded. Sub-directories are
// loaded recursively into new child levels named after the directory.
// Which files are read is decided by findFiles based on `from`.
// A missing dir is not an error. An extension without a matching loader is
// reported as an error once there are files to load.
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
	direntries, err := os.ReadDir(dir)
	if err != nil {
		if os.IsNotExist(err) {
			return 0, nil
		}
		return 0, err
	}

	allFiles := make([]fs.DirEntry, 0)
	filesLoaded := 0
	for _, e := range direntries {
		switch {
		case e.IsDir():
			child := &Level{
				metrics:  make([]*buffer, len(m.Metrics)),
				children: make(map[string]*Level),
			}

			files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
			filesLoaded += files
			if err != nil {
				return filesLoaded, err
			}

			// Levels may carry a nil children map, so create it lazily
			// before writing to it.
			if l.children == nil {
				l.children = make(map[string]*Level)
			}
			l.children[e.Name()] = child
		case strings.HasSuffix(e.Name(), "."+extension):
			allFiles = append(allFiles, e)
		}
	}

	files, err := findFiles(allFiles, from, extension, true)
	if err != nil {
		return filesLoaded, err
	}
	if len(files) == 0 {
		return filesLoaded, nil
	}

	loaders := map[string]func(*MemoryStore, *os.File, int64) error{
		"json": l.loadJSONFile,
		"avro": l.loadAvroFile,
	}
	loader, ok := loaders[extension]
	if !ok {
		// Calling the missing map entry would be a nil function call.
		return filesLoaded, fmt.Errorf("[METRICSTORE]> no loader for checkpoint file extension %q", extension)
	}

	for _, filename := range files {
		// Use a closure to ensure file is closed immediately after use
		err := func() error {
			f, err := os.Open(path.Join(dir, filename))
			if err != nil {
				return err
			}
			defer f.Close()

			return loader(m, f, from)
		}()
		if err != nil {
			return filesLoaded, err
		}

		filesLoaded += 1
	}

	return filesLoaded, nil
}
// findFiles selects checkpoint files by the timestamp encoded in their name
// ("<timestamp>.<extension>" or "<prefix>_<timestamp>.<extension>") and
// returns the selected names ordered by timestamp. Entries without the
// extension are not parsed. direntries is sorted in place.
//
// With findMoreRecentFiles set, every file with a timestamp >= t is returned,
// plus the file directly before t, because it may contain data reaching past t.
// Otherwise every file whose successor has a timestamp < t is returned, i.e.
// files that lie completely before t (the newest file is never returned).
//
// This will probably get very slow over time!
// A solution could be some sort of an index file in which all other files
// and the timespan they contain is listed.
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
	suffix := "." + extension
	nums := make(map[string]int64, len(direntries))
	for _, e := range direntries {
		name := e.Name()
		if !strings.HasSuffix(name, suffix) {
			continue
		}

		// Strip the real suffix instead of assuming that it is five bytes long.
		stem := strings.TrimSuffix(name, suffix)
		if _, after, ok := strings.Cut(stem, "_"); ok {
			stem = after
		}

		ts, err := strconv.ParseInt(stem, 10, 64)
		if err != nil {
			return nil, err
		}
		nums[name] = ts
	}

	sort.Slice(direntries, func(i, j int) bool {
		return nums[direntries[i].Name()] < nums[direntries[j].Name()]
	})

	filenames := make([]string, 0)
	for i, e := range direntries {
		ts1 := nums[e.Name()]

		if findMoreRecentFiles && t <= ts1 {
			filenames = append(filenames, e.Name())
		}
		if i == len(direntries)-1 {
			continue
		}

		ts2 := nums[direntries[i+1].Name()]
		if findMoreRecentFiles {
			// The file starts before t and its successor after t: it spans t.
			if ts1 < t && t < ts2 {
				filenames = append(filenames, e.Name())
			}
		} else if ts2 < t {
			filenames = append(filenames, e.Name())
		}
	}

	return filenames, nil
}

View File

@@ -0,0 +1,115 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"fmt"
"time"
)
// Default values of the metric store.
const (
// DefaultMaxWorkers: presumably the upper bound for the number of workers
// when num-workers is not configured — confirm where the default is applied.
DefaultMaxWorkers = 10
// DefaultBufferCapacity: presumably the number of values a buffer holds
// before a new one is chained — not referenced in this part of the file.
DefaultBufferCapacity = 512
// DefaultGCTriggerInterval is the value of GCTriggerInterval.
DefaultGCTriggerInterval = 100
// DefaultAvroWorkers: presumably the number of workers for Avro
// checkpointing — not referenced in this part of the file.
DefaultAvroWorkers = 4
// DefaultCheckpointBufferMin: presumably minutes; not referenced in this
// part of the file — TODO confirm unit and use.
DefaultCheckpointBufferMin = 3
// DefaultAvroCheckpointInterval is the period of the ticker that drives the
// Avro checkpoints in Checkpointing.
DefaultAvroCheckpointInterval = time.Minute
)
// MetricStoreConfig is the configuration of the built-in metric store as it
// is decoded from JSON.
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
Checkpoints struct {
// "json" selects the JSON checkpoint loop, every other value the Avro one;
// when restoring, an empty value is treated as "avro".
FileFormat string `json:"file-format"`
// Duration string (time.ParseDuration) between two JSON checkpoints.
Interval string `json:"interval"`
// Root directory of the checkpoint files.
RootDir string `json:"directory"`
// NOTE(review): per the config schema, the maximum age of checkpoints that
// are loaded at startup — not used in this part of the file.
Restore string `json:"restore"`
} `json:"checkpoints"`
// NOTE(review): debugging switches; not used in this part of the file.
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
// NOTE(review): per the config schema, how long metrics are kept in memory.
RetentionInMemory string `json:"retention-in-memory"`
// NOTE(review): per the config schema, settings for archiving checkpointed
// files; not used in this part of the file.
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
// Keys holds the active metric store configuration.
var Keys MetricStoreConfig
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int

// The known aggregation strategies; NoAggregation is the zero value.
const (
	NoAggregation AggregationStrategy = iota
	SumAggregation
	AvgAggregation
)

// AssignAggregationStrategy maps the configuration values "", "sum" and "avg"
// to their AggregationStrategy. Any other string yields NoAggregation together
// with an error.
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
	strategy := NoAggregation
	switch str {
	case "":
		// No aggregation configured.
	case "sum":
		strategy = SumAggregation
	case "avg":
		strategy = AvgAggregation
	default:
		return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
	}
	return strategy, nil
}
// MetricConfig describes how a single metric is stored.
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
// It is used as the index into the metrics slice of a level.
offset int
}
// Metrics is the package-wide registry of metric configurations by name,
// filled through AddMetric and read by GetMetricFrequency.
var Metrics map[string]MetricConfig
// GetMetricFrequency returns the Frequency registered for metricName in
// Metrics, or an error if the metric is unknown.
func GetMetricFrequency(metricName string) (int64, error) {
	metric, found := Metrics[metricName]
	if !found {
		return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
	}
	return metric.Frequency, nil
}
// AddMetric registers metric under name in Metrics, creating the map on first
// use. If the name is already registered, only its Frequency is touched: it is
// raised to metric.Frequency when that value is larger, so redundant
// definitions end up with the maximum Frequency. All other fields of the
// existing entry are kept. The returned error is always nil.
func AddMetric(name string, metric MetricConfig) error {
	if Metrics == nil {
		Metrics = make(map[string]MetricConfig, 0)
	}

	existing, registered := Metrics[name]
	switch {
	case !registered:
		Metrics[name] = metric
	case existing.Frequency < metric.Frequency:
		existing.Frequency = metric.Frequency
		Metrics[name] = existing
	}
	return nil
}

View File

@@ -0,0 +1,95 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
// configSchema is the JSON schema for the configuration section of the
// built-in metric store. It describes the `checkpoints` and `archive` objects,
// the `retention-in-memory` string and a `nats` array with connection settings
// and subscriptions.
// NOTE(review): MetricStoreConfig declares keys that are not listed here
// (`num-workers`, `debug`, `delete-instead` below `archive`, a top-level
// `subscriptions`), while `nats` has no counterpart in that struct — confirm
// which side reflects the intended configuration layout.
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
},
"restore": {
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
"type": "string"
}
}
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the archived files should be placed.",
"type": "string"
}
}
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"nats": {
"description": "Configuration for accepting published data through NATS.",
"type": "array",
"items": {
"type": "object",
"properties": {
"address": {
"description": "Address of the NATS server.",
"type": "string"
},
"username": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"password": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"creds-file-path": {
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
"type": "string"
},
"subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
}
}
}
}
}`

View File

@@ -0,0 +1,112 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"fmt"
"strconv"
)
// debugDump appends a JSON object describing this buffer (start, number of
// values, end and, for archived buffers, "saved":true) to buf, preceded by the
// objects of all older buffers. A comma is appended whenever a newer buffer
// follows, so the result is a comma separated list without brackets.
func (b *buffer) debugDump(buf []byte) []byte {
	// Older buffers come first.
	if b.prev != nil {
		buf = b.prev.debugDump(buf)
	}

	values := int64(len(b.data))

	buf = append(buf, `{"start":`...)
	buf = strconv.AppendInt(buf, b.start, 10)
	buf = append(buf, `,"len":`...)
	buf = strconv.AppendInt(buf, values, 10)
	buf = append(buf, `,"end":`...)
	buf = strconv.AppendInt(buf, b.start+b.frequency*values, 10)
	if b.archived {
		buf = append(buf, `,"saved":true`...)
	}

	buf = append(buf, '}')
	if b.next != nil {
		buf = append(buf, ',')
	}
	return buf
}
// debugDump writes this level as a JSON member `"lvlname":{…}` with one array
// per metric that has a buffer and one nested object per child level, indented
// by depth tabs. The read lock of the level is held for the whole call.
// Output is produced in buf; before each child is dumped, the content
// collected so far is flushed to w and buf is reused. The not yet written
// remainder is returned and has to be written by the caller.
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	for i := 0; i < depth; i++ {
		buf = append(buf, '\t')
	}
	buf = append(buf, '"')
	buf = append(buf, lvlname...)
	buf = append(buf, "\":{\n"...)
	depth += 1

	objitems := 0
	for name, mc := range m.Metrics {
		if b := l.metrics[mc.offset]; b != nil {
			for i := 0; i < depth; i++ {
				buf = append(buf, '\t')
			}

			buf = append(buf, '"')
			buf = append(buf, name...)
			buf = append(buf, `":[`...)
			buf = b.debugDump(buf)
			buf = append(buf, "],\n"...)
			objitems++
		}
	}

	for name, lvl := range l.children {
		// Flush what has been collected so far, then reuse the buffer for the child.
		_, err := w.Write(buf)
		if err != nil {
			return nil, err
		}

		buf = buf[0:0]
		buf, err = lvl.debugDump(m, w, name, buf, depth)
		if err != nil {
			return nil, err
		}

		buf = append(buf, ',', '\n')
		objitems++
	}

	// Every item above ends in ",\n". Replace that of the last item by "\n":
	// a trailing comma is not valid JSON. Two bytes have to go for this;
	// cutting only one would merely re-append the newline.
	if objitems > 0 {
		buf = append(buf[0:len(buf)-2], '\n')
	}

	depth -= 1
	for i := 0; i < depth; i++ {
		buf = append(buf, '\t')
	}
	buf = append(buf, '}')
	return buf, nil
}
// DebugDump writes a JSON description of the buffers of the level addressed
// by selector (and of everything below it) to w as {"data":{…}} and flushes w.
// An error is returned if no level matches selector or if writing fails.
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
	lvl := m.root.findLevel(selector)
	if lvl == nil {
		return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
	}

	out := append(make([]byte, 0, 2048), '{')
	out, err := lvl.debugDump(m, w, "data", out, 0)
	if err != nil {
		return err
	}

	// The level dump hands back the part it has not written yet.
	out = append(out, "}\n"...)
	if _, err = w.Write(out); err != nil {
		return err
	}
	return w.Flush()
}

Some files were not shown because too many files have changed in this diff Show More