839 Commits

Author SHA1 Message Date
dependabot[bot]
5d518208e4 Bump svelte
Bumps the npm_and_yarn group with 1 update in the /web/frontend directory: [svelte](https://github.com/sveltejs/svelte/tree/HEAD/packages/svelte).


Updates `svelte` from 5.53.2 to 5.53.6
- [Release notes](https://github.com/sveltejs/svelte/releases)
- [Changelog](https://github.com/sveltejs/svelte/blob/main/packages/svelte/CHANGELOG.md)
- [Commits](https://github.com/sveltejs/svelte/commits/svelte@5.53.6/packages/svelte)

---
updated-dependencies:
- dependency-name: svelte
  dependency-version: 5.53.6
  dependency-type: direct:development
  dependency-group: npm_and_yarn
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-02-28 05:31:13 +00:00
Jan Eitzinger
a6e23dd52e Merge pull request #503 from ClusterCockpit/dev
Add csv export for user/prject list
2026-02-24 20:29:51 +01:00
45a1bc78b7 Add csv export for user/prject list
Fixes #389
2026-02-24 20:26:12 +01:00
Jan Eitzinger
8bacffbd3e Merge pull request #502 from ClusterCockpit/dev
reintroduce state information for pie charts
2026-02-24 20:09:07 +01:00
Christoph Kluge
ff180affd7 reintroduce state information for pie charts
- removed by oversight
2026-02-24 10:07:09 +01:00
Jan Eitzinger
248b923980 Merge pull request #501 from ClusterCockpit/dev
Update jobclass rules
2026-02-24 07:03:23 +01:00
5d136634a2 Update jobclass rules 2026-02-24 07:00:15 +01:00
Jan Eitzinger
ac5ee1564a Merge pull request #500 from ClusterCockpit/dev
Dev
2026-02-24 06:46:49 +01:00
998aff2345 Fix bug in taggers. Allow to add multiple tags to one job 2026-02-24 06:44:17 +01:00
Jan Eitzinger
0dea959391 Merge pull request #498 from ClusterCockpit/add_GetMemoryDomainsBySocket_2026
migrate changes from cc-backend PR#364 in 2026
2026-02-23 18:52:00 +01:00
dadcb983e7 Merge branch 'dev' into add_GetMemoryDomainsBySocket_2026 2026-02-23 18:47:03 +01:00
8989b7a410 Update cclib 2026-02-23 18:45:41 +01:00
Jan Eitzinger
2b56e02a3e Merge pull request #499 from ClusterCockpit/dev
Fix and extend jobclass rules
2026-02-23 17:58:14 +01:00
Michael Panzlaff
bcfe9022de Makefile: use npm ci 2026-02-23 14:51:19 +01:00
Christoph Kluge
31f3c28294 Remove schedState and metaData from metricHealth table 2026-02-23 13:26:06 +01:00
Jan Eitzinger
12c01655c3 Merge pull request #496 from ClusterCockpit/add_editmetabyrequest_2026
Manually migrate editMetaByRequest to current state of 2026
2026-02-23 12:53:07 +01:00
ab55ce91a1 Change omit tagged key to kebab case 2026-02-23 09:02:36 +01:00
03c65e06f6 Allow finer control for omit tagged jobs in retention policies 2026-02-23 08:46:47 +01:00
defa8fa994 Update ReleaseNotes for v1.5.0 2026-02-23 08:45:49 +01:00
c9d8de0d56 Fix and extend jobclass rules 2026-02-22 13:27:51 +01:00
Jan Eitzinger
2b788f14ec Merge pull request #495 from ClusterCockpit/dev
Fix more bugs related to job_cache ids used in job table
2026-02-22 09:57:37 +01:00
86fbecc679 Update frontend deps 2026-02-22 09:45:48 +01:00
8ee6c09e9b Enable to run taggers from within admin web interface 2026-02-22 09:42:53 +01:00
fc1ba1f5b3 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-21 13:52:14 +01:00
82e79b074a Reverse Lookup order in stop job request 2026-02-21 13:51:31 +01:00
Christoph Kluge
bae7ec11b4 migrate changes from cc-backend PR#364 2026-02-20 15:10:02 +01:00
Christoph Kluge
2d90fd05d6 Manually migrate editMetaByRequest to current state of 2026
- originally PR #400
2026-02-20 10:38:07 +01:00
e1c1148160 Fix more bugs related to job_cache ids used in job table 2026-02-20 09:20:18 +01:00
Jan Eitzinger
5ee3bbdbf5 Merge pull request #494 from ClusterCockpit/dev
Dev
2026-02-20 08:43:54 +01:00
dc161ec421 Fix bug in SyncJobs causing start job hooks being called with job_cache ID 2026-02-20 08:29:41 +01:00
abd11d783b Use alternative solution to deal with NULL values in health_state column 2026-02-20 08:12:15 +01:00
Jan Eitzinger
39c919bb0c Merge pull request #493 from ClusterCockpit/dev
Dev
2026-02-20 07:49:13 +01:00
064aa0a238 Add logging for AppTagger 2026-02-20 07:44:00 +01:00
f00f9fcee0 Cleanup debug printf 2026-02-19 17:42:45 +01:00
Christoph Kluge
705d70ddc0 add row plot cursor syncing 2026-02-19 15:42:20 +01:00
Christoph Kluge
7789489d08 fix automatic refresh in metric status tab 2026-02-19 14:25:03 +01:00
Christoph Kluge
62cd21eb83 switch to select filters 2026-02-19 13:01:02 +01:00
90b52f997d Cleanup and handle error in AppTagger 2026-02-19 08:24:39 +01:00
745c0357f3 Handle NULL values on health_metrics column 2026-02-19 08:04:45 +01:00
Jan Eitzinger
087c00eb7f Merge pull request #492 from ClusterCockpit/dev
Add log output and fix bugs in AppTagger
2026-02-19 06:37:23 +01:00
Christoph Kluge
facfc9d4c9 increase column size 2026-02-18 16:22:02 +01:00
Christoph Kluge
89e06c3530 fix layout of metrichealth table 2026-02-18 16:15:04 +01:00
57536d982c Add log output and fix bugs in AppTagger 2026-02-18 16:00:50 +01:00
Jan Eitzinger
0c0f423b84 Merge pull request #491 from ClusterCockpit/dev
Dev
2026-02-18 10:31:37 +01:00
7bd79dcc3c Add nodestate retention in example config 2026-02-18 10:28:41 +01:00
2a659915a4 Update config schema 2026-02-18 10:18:33 +01:00
Jan Eitzinger
c77e2969c5 Merge pull request #490 from ClusterCockpit/dev
Dev
2026-02-18 09:24:39 +01:00
d8ad6dd3f0 Upgrade cc-lib to 2.5.1 2026-02-18 09:21:05 +01:00
cd8b574cca Rename nodestate retention policy 2026-02-18 08:38:22 +01:00
2da35909c1 Optimize sort order in job parquet files 2026-02-18 08:13:00 +01:00
2e24fde430 Optimize sort order in nodestate parquet files 2026-02-18 08:06:00 +01:00
757be60b22 Switch to zstd compression for parquet writers 2026-02-18 07:55:28 +01:00
589149790f Upgrade dependencies 2026-02-18 07:45:45 +01:00
29c440a637 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-17 21:06:20 +01:00
6035b62734 Run go fix 2026-02-17 21:04:17 +01:00
Christoph Kluge
fe3ebe0abc add healthStatus tab to status details, add graphql endpoints and handlers 2026-02-17 14:38:06 +01:00
Jan Eitzinger
a52d7d017f Merge pull request #489 from ClusterCockpit/dev
Dev
2026-02-17 08:22:10 +01:00
9af44779aa Add error handling in tagger initialization 2026-02-17 08:13:50 +01:00
Aditya Ujeniya
1cf2c41bd7 Resize the buffers and put them into the pool 2026-02-16 18:21:45 +01:00
Aditya Ujeniya
2eeefc2720 Add healthCheck support for external CCMS 2026-02-16 16:57:17 +01:00
Jan Eitzinger
0dd894890f Merge pull request #488 from ClusterCockpit/log-aggregator
Log aggregator
2026-02-16 09:38:31 +01:00
25ff094bdf Fix log endpoint 2026-02-16 09:10:08 +01:00
d2ff4a2e02 Fix log endpoint and reformat 2026-02-16 09:09:50 +01:00
4b983e3b9b Add server flag to example systemd unit 2026-02-16 09:08:57 +01:00
98f9c18f72 Merge branch 'dev' into log-aggregator 2026-02-15 19:53:34 +01:00
7d8b305cd9 Add format conversion to archive manager 2026-02-13 13:54:10 +01:00
6e3462f962 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-13 12:19:33 +01:00
2c8608f5a4 Update job archive retention to uniform policy with json and parquet target format 2026-02-13 12:19:31 +01:00
Christoph Kluge
3215bc3de0 review loading indicators in nodeList 2026-02-13 11:58:52 +01:00
140c7f6e47 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-12 15:45:20 +01:00
c15f1117f5 Review and improve node repo queries 2026-02-12 15:45:15 +01:00
Christoph Kluge
48729b172d improve nodeList loading indicator, streamlining 2026-02-12 14:27:41 +01:00
Jan Eitzinger
76ce8122e2 Merge pull request #487 from ClusterCockpit/dev
Dev
2026-02-12 11:30:24 +01:00
f016bd4232 Extend node repository unit tests 2026-02-12 09:30:14 +01:00
54ea5d7900 Add nodestate retention and archiving 2026-02-12 09:21:44 +01:00
865cd3db54 Prersist faulty nodestate metric lists to db 2026-02-12 08:48:15 +01:00
90c8fbf07c Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-12 07:34:26 +01:00
f4ee0d1042 Update cc-lib and extend nodestate sql schema 2026-02-12 07:34:24 +01:00
Christoph Kluge
e75da7f8cc fix reactivity key placement in nodeList 2026-02-11 18:32:29 +01:00
Christoph Kluge
12e9f6700e fix nodeList resolver data handling, increase nodestate filter cutoff
- add comment on cutoff
2026-02-11 16:16:09 +01:00
bca7dd743b Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-11 07:06:33 +01:00
8d6c6b819b Update and port to cc-lib 2026-02-11 07:06:06 +01:00
Christoph Kluge
d07a537f0b move starttime range string 2026-02-10 17:24:30 +01:00
Christoph Kluge
5e4994a64c revert external config supply for nodeList component 2026-02-10 16:46:18 +01:00
Christoph Kluge
a5a1fd1a6a fix missing component argument 2026-02-10 15:47:38 +01:00
Christoph Kluge
49a1748641 add not configured info cards, show short job filter options if one active filter 2026-02-10 13:49:23 +01:00
Jan Eitzinger
a2f0b57ab9 Merge pull request #486 from ClusterCockpit/dev
Dev
2026-02-10 09:25:54 +01:00
0dff9fa07f Update docs and agent files 2026-02-10 09:17:34 +01:00
1feb3baf68 Create copy of test db before unit tests 2026-02-10 07:53:30 +01:00
d21943a514 Upgrade cc-lib 2026-02-10 07:52:58 +01:00
035ac2384e Refactor GlobalMetricLists 2026-02-09 21:56:41 +01:00
ac7eb93141 fix: Transfer always to main job table before archiving 2026-02-09 19:57:46 +01:00
Christoph Kluge
51e9d33f9f fix empty availability print case 2026-02-09 17:21:49 +01:00
Christoph Kluge
d1e7ea09bc review handling of disabled metrics in frontend 2026-02-09 15:33:59 +01:00
Christoph Kluge
7dd3ee3084 remove undocumented minRunningFor filter, add short jobs quick selection instead 2026-02-09 12:23:21 +01:00
1980ef5f43 Renaming due to linter complaints 2026-02-09 09:17:01 +01:00
fd9b76c6a7 Security hardening of ldap and oicd auth implementations 2026-02-09 09:12:06 +01:00
abdd7ea6f1 Merge branch 'master' into dev 2026-02-09 07:46:44 +01:00
c7b366f35f Put notFoundHandler earlier to also catch subrouters 2026-02-09 07:46:37 +01:00
Jan Eitzinger
84e715a273 Merge pull request #485 from ClusterCockpit/change-web-framework
Change web framework
2026-02-07 18:38:05 +01:00
396a628175 Merge branch 'master' into change-web-framework 2026-02-07 18:29:44 +01:00
624746f34b Fix 404 handler route 2026-02-07 18:29:27 +01:00
2b395a94e6 Fix setup issue with chi router 2026-02-07 18:02:48 +01:00
f6aa40d927 Migrate from gorilla to chi web framework. add 404 handler 2026-02-07 17:48:12 +01:00
c920c57f5d Add parquet file job archiving target 2026-02-07 10:51:56 +01:00
363e839c49 Add simple log viewer in web frontend 2026-02-07 07:05:33 +01:00
Jan Eitzinger
e681e9e7ec Merge pull request #484 from ClusterCockpit/dev
Dev
2026-02-07 06:23:44 +01:00
a8194de492 Add diagnostic output for healthcheck 2026-02-07 06:17:34 +01:00
Aditya Ujeniya
1145b31a49 Updated dataGenerator.sh script 2026-02-06 18:07:32 +01:00
Christoph Kluge
b7df1f56ef fix missing sveltetrap component 2026-02-06 18:05:55 +01:00
Christoph Kluge
c43d4a0f16 complete review of context initialization and access, streamlining 2026-02-06 17:51:57 +01:00
a8d385a1ee Update HealthCheck again Still WIP 2026-02-06 16:35:02 +01:00
5579b6f40c Adopt unit test to new API 2026-02-06 16:11:10 +01:00
7123a8c1cc Updated HealthCheck implementation WIP 2026-02-06 16:04:01 +01:00
Jan Eitzinger
9cc09145ec Merge pull request #483 from ClusterCockpit/dev
Dev
2026-02-06 14:55:11 +01:00
6294f8e263 Review and improve detectApp implementation 2026-02-06 14:53:05 +01:00
0adf2bad92 Add info log about applied tag 2026-02-06 14:50:13 +01:00
a85f72fccd Change log level to debug for nodestate API endpoint 2026-02-06 14:30:04 +01:00
db8772dc0b Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-06 14:06:59 +01:00
fa7727c6ca Print job db id instead of its address 2026-02-06 14:06:56 +01:00
Christoph Kluge
5655639320 add subCluster jobFilter for statusDetail queries 2026-02-06 11:10:08 +01:00
Jan Eitzinger
720f40c9c9 Merge pull request #482 from ClusterCockpit/dev
Dev
2026-02-06 09:51:47 +01:00
f671d8df90 Add counts in healthcheck for logging output 2026-02-06 09:25:09 +01:00
265da42385 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-06 09:08:50 +01:00
b160284a1b Update vasp example tagger rule 2026-02-06 09:08:46 +01:00
Aditya Ujeniya
fcb37b0367 Update to count healthy metrics 2026-02-06 08:45:36 +01:00
0984c1d431 Add debug log with degrade and missing metrics for healthcheck 2026-02-06 07:21:04 +01:00
Christoph Kluge
276559d120 revert endpoint change 2026-02-05 15:25:49 +01:00
Christoph Kluge
e3148b16eb add timers to updateNodeStates 2026-02-05 15:24:11 +01:00
Christoph Kluge
4d13c37008 remove non-required return in updatenodehandler 2026-02-05 15:18:01 +01:00
Christoph Kluge
84d7a7aa7d add running default filter to list name column links if useful 2026-02-05 13:06:46 +01:00
Christoph Kluge
5616801f3e review user vinformation block, reorder reactive var in jobList 2026-02-05 10:33:20 +01:00
Jan Eitzinger
b5f6ee9c0c Merge pull request #481 from ClusterCockpit/dev
Dev
2026-02-04 19:46:02 +01:00
Christoph Kluge
af73ce9c6d Add usage status tabs for each subCluster 2026-02-04 18:02:21 +01:00
Christoph Kluge
a7a95bb866 add shortjobs and resource sums to project and user lists 2026-02-04 13:01:30 +01:00
5d7dd62b72 Update unit test for new HealthCheck update 2026-02-04 12:53:24 +01:00
46fb52d67e Adopt documentation 2026-02-04 12:30:33 +01:00
Aditya Ujeniya
39b8356683 Optimized CCMS healthcheck 2026-02-04 10:24:45 +01:00
42ce598865 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-03 18:35:35 +01:00
0d62a300e7 Intermediate state of node Healthcheck
TODOS:
* Remove error handling from routine and simplify API call
* Use map for hardware level metrics
2026-02-03 18:35:17 +01:00
Aditya Ujeniya
3cf88f757c Update to checkpoint loader in CCMS 2026-02-03 16:25:48 +01:00
75a74c162d Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-02-03 14:55:47 +01:00
248f11f4f8 Change API of Node HealthState 2026-02-03 14:55:12 +01:00
Christoph Kluge
75e849922d Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-02-03 14:34:57 +01:00
Christoph Kluge
d39b955b25 fix doubleMetric xlegend label 2026-02-03 14:34:53 +01:00
00a41373e8 Add monitoring healthstate support in nodestate API. 2026-02-03 12:23:24 +01:00
Christoph Kluge
e9cd6b4225 set updateNodeStates timeStamp once per request
-prevents per-host timestamp mismatches due to handler iteration duration
2026-02-02 17:51:41 +01:00
Christoph Kluge
13cca1ee62 change log msg on clusterMetrics 2026-02-02 14:45:19 +01:00
Christoph Kluge
7b4e2fcf59 add isNan to clusterMetric aggregation 2026-02-02 14:34:49 +01:00
Christoph Kluge
f2285e603b fix disabled-false-positives, add info if no metrics selected 2026-02-02 10:38:05 +01:00
Christoph Kluge
b7bd8210e5 revert column defaults,keep general desc default order 2026-01-31 20:34:16 +01:00
Christoph Kluge
1791e665aa add default orders on list sorting change 2026-01-31 11:25:16 +01:00
Aditya Ujeniya
a71341064e Update to MetricStore HealthCheck API 2026-01-30 23:24:16 +01:00
Jan Eitzinger
74dbbaa794 Merge pull request #480 from ClusterCockpit/dev
Change web ui defaults and fix ui config json schema
2026-01-30 18:20:02 +01:00
Christoph Kluge
732fab4a04 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-30 17:22:42 +01:00
Christoph Kluge
aa3fcbfe17 add compareJobs feature to user job list view 2026-01-30 17:22:39 +01:00
5e58c9f376 Adapt unit test to changed implementation 2026-01-30 16:49:55 +01:00
b600eeca5e Change web ui defaults and fix ui config json schema 2026-01-30 16:31:09 +01:00
Jan Eitzinger
e6662c4592 Merge pull request #479 from ClusterCockpit/dev
Dev
2026-01-30 16:26:27 +01:00
Christoph Kluge
1ffcc5e241 apply ccUnit for clusterMetrics normalitazion 2026-01-30 15:52:59 +01:00
Christoph Kluge
32f0664012 add indicator to nodeView state, cap bubble size in roofline 2026-01-30 14:32:41 +01:00
Christoph Kluge
2ef1826b12 format nodeOverview state badges, review textFilter
- textFilter now remains on compare
2026-01-30 12:51:49 +01:00
Christoph Kluge
d397457ce6 disable forced filter reset on compare 2026-01-29 18:05:03 +01:00
Christoph Kluge
e8c81ba7d4 various small dashboard fixes
- piechart color, idle count cap, metricHistoMaximum increased
2026-01-29 17:46:01 +01:00
Christoph Kluge
318dbd65e0 dynamically scale dashboard sums 2026-01-29 17:01:07 +01:00
Christoph Kluge
dd56e75b50 improve detail on warning cards 2026-01-29 16:02:13 +01:00
Christoph Kluge
df93786474 fix system timeselect display
- float mismatch triggered unknownValue
2026-01-29 15:22:40 +01:00
Christoph Kluge
4deec9a170 no append if ErrNoHostOrMetric fired 2026-01-29 15:18:50 +01:00
Christoph Kluge
f26cabbdf1 Streamline missing data warnings, review logging 2026-01-29 15:17:33 +01:00
Jan Eitzinger
195a1edcfe Merge pull request #478 from ClusterCockpit/dev
Dev
2026-01-29 12:39:25 +01:00
Aditya Ujeniya
7101d2bb3b Handle the metric/host not found case differently 2026-01-28 17:47:38 +01:00
Christoph Kluge
3452891613 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-28 16:50:32 +01:00
Christoph Kluge
b25abc5f16 fix selectedMEtric presets in jobList
- remove conflicting reactivity
2026-01-28 16:50:27 +01:00
Jan Eitzinger
60a847922e Merge pull request #477 from ClusterCockpit/dev
Dev
2026-01-28 11:24:10 +01:00
0d857b49a2 Disable explicit GC calls 2026-01-28 11:21:27 +01:00
eb5aa9ad02 Disable explicit GC calls 2026-01-28 11:21:02 +01:00
98661aad15 Increase default GC frequency 2026-01-28 10:41:44 +01:00
69739ffdfd Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-28 07:05:32 +01:00
95689e3c99 Add API endpoint for getUsedNodes
Needed by dynamic memory management for external ccms
2026-01-28 07:05:29 +01:00
Christoph Kluge
9d9babe94d review clusterMetrics aggregation handling, fixes index error 2026-01-27 19:04:29 +01:00
9d15a87c88 Take into account the real allocated heap memory in MemoryUsageTracker 2026-01-27 18:23:09 +01:00
Christoph Kluge
719aaaff4b fix data flipping on doublemetric render 2026-01-27 17:39:16 +01:00
bbde91a1f9 Move wg increment inside goroutines. Make GC calls less aggressive 2026-01-27 17:25:29 +01:00
55cb2cb6d6 Prevent file not closed on error in avro checkpoint 2026-01-27 17:10:26 +01:00
752e19c276 Pull out metric List build from metricstore Init 2026-01-27 17:06:52 +01:00
Jan Eitzinger
dac382af53 Merge pull request #476 from ClusterCockpit/dev
Dev
2026-01-27 16:03:12 +01:00
28a3ff8d67 Upgrade cc-lib. Change nodestate log levels 2026-01-27 14:26:41 +01:00
Christoph Kluge
ae81687da9 add jobList load indicator 2026-01-27 13:49:15 +01:00
Christoph Kluge
2173d3527d add rounding to statsTable data 2026-01-27 13:31:16 +01:00
2859f12dc1 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-27 10:02:39 +01:00
b307e885ce feat: Add support for multiple external metric stores 2026-01-27 10:02:07 +01:00
4853814228 Review and refactor metric-store client. Add documentation 2026-01-27 07:17:17 +01:00
ca08717b9d Reintroduce external metric-store client 2026-01-27 07:00:13 +01:00
Christoph Kluge
934bc13c2c add fallback case to extendedLegend render 2026-01-26 18:09:54 +01:00
Christoph Kluge
4aa337ccc8 fix missing index change 2026-01-26 16:05:42 +01:00
Christoph Kluge
700f2aad55 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-26 15:53:02 +01:00
Christoph Kluge
836e6e4242 Review duration filter handling, update migration indices 2026-01-26 15:53:00 +01:00
b5182c4c13 Prepare release 1.5.0 2026-01-26 15:29:28 +01:00
Jan Eitzinger
808e281ee8 Merge pull request #475 from ClusterCockpit/dev
Dev
2026-01-26 14:09:25 +01:00
b9e65b50db Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-26 13:17:38 +01:00
0ea836c69c Revert metricstore api paths 2026-01-26 13:17:36 +01:00
Christoph Kluge
e074bb315c review job info row layout 2026-01-26 11:53:19 +01:00
Jan Eitzinger
084d00cb0d Merge pull request #474 from ClusterCockpit/dev
Dev
2026-01-26 09:27:52 +01:00
7ecfc8468e Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-26 08:38:56 +01:00
c782043c64 Upgrade cclib and remove usage of obsolete util.Float 2026-01-26 08:38:53 +01:00
Christoph Kluge
fbf4004e92 move load indicator to after job list render
- prevents location reset when continuously scrolling
2026-01-23 18:50:28 +01:00
Christoph Kluge
a2c1b65f91 set variables in nodeitems handler 2026-01-23 18:42:07 +01:00
Christoph Kluge
0af550bf4e fix nodelist paging 2026-01-23 18:26:10 +01:00
Christoph Kluge
436194e46d enable fallback for non initialized node table 2026-01-23 17:45:44 +01:00
Christoph Kluge
49938bcef8 remove blocking backend check
- threw errors on expected and correctly handled behavior for nodeList queries
2026-01-23 17:41:21 +01:00
Christoph Kluge
da2a78faa3 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-23 16:54:35 +01:00
Christoph Kluge
98dc8cf5b5 add loading indicators to status detail 2026-01-23 16:54:30 +01:00
Aditya Ujeniya
cd810b45ec Adding dataGenerator script 2026-01-23 16:51:17 +01:00
Christoph Kluge
22b1d4d276 review public dashboard layout 2026-01-23 15:25:00 +01:00
Christoph Kluge
25c5457ef3 review internal dash info card layout 2026-01-23 15:14:54 +01:00
Christoph Kluge
ea6b9d910b switch to cluster grouping in dashboard queries 2026-01-23 13:39:34 +01:00
Christoph Kluge
5567371ccd move list filter preset to url 2026-01-23 11:20:32 +01:00
Jan Eitzinger
585c4fcace Merge pull request #473 from ClusterCockpit/dev
Dev
2026-01-23 10:17:28 +01:00
525d99140f Fix missing comma in metristore schema 2026-01-23 10:15:29 +01:00
499b4287f9 Switch to cclib nats client 2026-01-23 10:04:41 +01:00
Jan Eitzinger
b7df4f7cca Merge pull request #472 from ClusterCockpit/dev
Dev
2026-01-23 07:54:17 +01:00
f41301036b Move metricstore from internal to pkg 2026-01-23 07:49:47 +01:00
30516776e5 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-22 20:31:18 +01:00
07afcc4cd4 Adopt naming of descriptions 2026-01-22 20:31:15 +01:00
Michael Panzlaff
05abea87e7 Do not warn about unencrypted auth when encrypted revsere proxy is used 2026-01-22 14:06:34 +01:00
Christoph Kluge
4459840f5f chage to adaptive row sizes for publicDash 2026-01-21 18:14:15 +01:00
Christoph Kluge
55e0456aac Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-21 12:21:56 +01:00
Christoph Kluge
f18ae35030 review plot rendering and doublemetric opts 2026-01-21 12:21:52 +01:00
Christoph Kluge
f416be77f7 add publicmode to doublemetric plot, 2026-01-20 16:26:00 +01:00
1d4c79c821 Unify JSON attribute naming ot use kebab style case. Cleanup configuration. 2026-01-20 09:47:13 +01:00
d4edbd7d1a Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-19 16:43:04 +01:00
5281f3bb60 Remove obsolete config option disable-archive 2026-01-19 16:42:30 +01:00
Christoph Kluge
e91fbf405f xMerge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-19 09:44:36 +01:00
Christoph Kluge
1d41ff8190 add jobSummary conditional display 2026-01-16 17:26:24 +01:00
Aditya Ujeniya
77a2a256e4 Update to MemoryUsage worker 2026-01-16 17:01:39 +01:00
Christoph Kluge
eb09504306 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-16 14:14:45 +01:00
Christoph Kluge
b912be5978 Review polar plot scaling, make tabbing dependent on config
- remove polar min data
2026-01-16 14:14:41 +01:00
Aditya Ujeniya
1a41629535 Refactor Archive to Cleanup and rename everything 2026-01-16 14:09:50 +01:00
Christoph Kluge
b81d9b05ac add subcluster reactivity to jobList 2026-01-16 12:35:17 +01:00
Christoph Kluge
1d62ee1e22 rollback urql version bump
- breaks gql queries as is
2026-01-16 10:43:54 +01:00
Christoph Kluge
55d2c7d7eb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-16 09:32:05 +01:00
Christoph Kluge
bb527fb410 bump frontend dependencies 2026-01-16 09:32:02 +01:00
9a97d0e8eb Add documentation 2026-01-16 08:27:46 +01:00
93dcfee8c5 Document and refactor 2026-01-16 08:27:15 +01:00
76139ef53c Remove now optional apiAllowedIPs option 2026-01-16 08:23:31 +01:00
Aditya Ujeniya
32319adf72 Add Memory Tracker worker for CCMS 2026-01-15 21:29:21 +01:00
Aditya Ujeniya
10a5c89a16 Fix logic for findFiles() and keep archive worker 2026-01-15 20:27:11 +01:00
Christoph Kluge
40bff1eff9 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 18:18:54 +01:00
Christoph Kluge
ceba4eb0c6 Review dashboards, add twosided progress to indernal dash 2026-01-15 18:18:51 +01:00
Christoph Kluge
faacf3f343 svelte state_referenced_locally warning fixes
- change to derived where possible
- suppress warning elsewhere
- discussion here: sveltejs/svelte/issues/17289
2026-01-15 18:17:45 +01:00
Aditya Ujeniya
7cd98c4f25 Test and update files for dynamic retention 2026-01-15 17:48:59 +01:00
Michael Panzlaff
489ad44b9f Make apiAllowedIPs optional
If our test and production instance just use *, one might as well make
that the default value. This should ease configuration for minimal
setups.
2026-01-15 16:08:29 +01:00
Jan Eitzinger
02a8cf05d1 Merge pull request #471 from ClusterCockpit/dev
Dev
2026-01-15 16:00:30 +01:00
7db2bbe6b0 Add job tagger option to example config 2026-01-15 15:53:54 +01:00
b6f0faa97f Make polarplot default in Jobview 2026-01-15 15:47:40 +01:00
a3fffa8e8b Update example and demo config 2026-01-15 13:57:15 +01:00
72248defbf Cleanup print statements. Always enable Compression 2026-01-15 13:39:22 +01:00
155e05495e Fix shutdown timout bug 2026-01-15 13:29:19 +01:00
9c92a7796b Introduce nodeprovider interface to break import cycle 2026-01-15 12:20:11 +01:00
7c78407c49 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-15 11:34:09 +01:00
cb219b3c74 Fix configuration issues. Fix shutdown hangs
Always turn on compression
2026-01-15 11:34:06 +01:00
d59aa2e855 Restructure configuration with sensible defaults. Fix shutdown hangs 2026-01-15 11:33:01 +01:00
Christoph Kluge
cd3d133f0d Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-15 10:12:02 +01:00
Christoph Kluge
3b7fc44ce9 specify job count return logs, move to debug level 2026-01-15 10:11:31 +01:00
e1efc68476 Update dependencies. Rebuild graphql and swagger 2026-01-15 08:32:06 +01:00
8f0bb907ff Improve documentation and add more tests 2026-01-15 06:41:23 +01:00
Christoph Kluge
e5c620ca20 filter metrics for NodeMetrics query 2026-01-14 19:22:31 +01:00
Christoph Kluge
d0bcfb90e6 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 17:18:25 +01:00
Christoph Kluge
9deee54e41 make nextPage query conditional if no return limit is requested 2026-01-14 17:18:21 +01:00
Michael Panzlaff
94b86ef11a Mismatched event types are not something to be concerned about
If a different CCMessage type was sent over the same subject as
requested, that shouldn't raise a warning. This may happen in production
instances, but in order to ease debugging, lower it to 'debug' level.
2026-01-14 16:08:33 +01:00
Christoph Kluge
d8cd752dcb Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 16:03:39 +01:00
Christoph Kluge
5d376e6865 fix clustername array init size 2026-01-14 16:03:34 +01:00
9c3beddf54 Improve documentation 2026-01-14 15:39:38 +01:00
c6465ad9e5 Add s3 configuration options 2026-01-14 15:28:34 +01:00
d415381d4a Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 15:10:25 +01:00
211d4fae54 Refactor s3 backend and suppress checksum warning 2026-01-14 15:10:20 +01:00
Aditya Ujeniya
3276ed7785 Half-baked commit for new dynamic retention logic 2026-01-14 14:56:36 +01:00
Christoph Kluge
77b7548ef3 fix wrong varname 2026-01-14 13:00:36 +01:00
Christoph Kluge
59851f410e Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-14 11:25:43 +01:00
Christoph Kluge
4cb8d648cb adapt frontend to backend config changes, clarify variable names 2026-01-14 11:25:40 +01:00
c8627a13f4 Remove obsolete slusters config section 2026-01-14 11:17:49 +01:00
0ea0270fe1 Reintroduce Clusters as string list of cluster names 2026-01-14 10:37:07 +01:00
19402d30af Review and improve error messages and doc comments 2026-01-14 10:09:19 +01:00
b2f870e3c0 Convert nodestate nats API to influx line protocol payload. Review and add doc comments.
Improve and extend tests
2026-01-14 10:08:06 +01:00
9e542dc200 Review and improve, add documentation 2026-01-14 09:26:03 +01:00
6cf59043a3 Review and improve, add documentation 2026-01-14 08:59:27 +01:00
71b75eea0e Improve GetUsedNodes function 2026-01-14 08:49:55 +01:00
e900a686db Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2026-01-14 07:38:04 +01:00
fb8db3c3ae Add query which node metric data needs to be retained 2026-01-14 07:37:31 +01:00
Christoph Kluge
170a9ace8a Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2026-01-13 16:59:57 +01:00
Christoph Kluge
518e9950ea add job exclusivity filter, review db indices 2026-01-13 16:59:52 +01:00
25c8fca561 Revert retention config in metricstore 2026-01-13 14:42:24 +01:00
754f7e16f6 Reformat with gofumpt 2026-01-13 09:52:31 +01:00
04a2e460ae Refactor metricstore. Initial stub for cluster/ subcluster specific retention times 2026-01-13 09:52:00 +01:00
2ebab1e2e2 Reformat with gofumpt 2026-01-13 09:50:57 +01:00
a9366d14c6 Add README for tagging. Enable tagging by flag without configuration option 2026-01-13 08:32:32 +01:00
42809e3f75 Remove embedded tagger rules 2026-01-13 07:20:26 +01:00
4cec933349 Remove obsolete cluster config section 2026-01-13 06:28:33 +01:00
d3f3c532b1 Merge branch 'master' into dev 2026-01-12 11:18:56 +01:00
ad1e87d0b8 Disable dependabot alerts 2026-01-12 11:17:44 +01:00
Jan Eitzinger
affa85c086 Merge pull request #469 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/credentials-1.19.7
Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
2026-01-12 10:30:35 +01:00
Jan Eitzinger
aa053d78f7 Merge pull request #470 from ClusterCockpit/dependabot/go_modules/github.com/mattn/go-sqlite3-1.14.33
Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
2026-01-12 10:30:16 +01:00
dependabot[bot]
fae6d9d835 Bump github.com/mattn/go-sqlite3 from 1.14.32 to 1.14.33
Bumps [github.com/mattn/go-sqlite3](https://github.com/mattn/go-sqlite3) from 1.14.32 to 1.14.33.
- [Release notes](https://github.com/mattn/go-sqlite3/releases)
- [Commits](https://github.com/mattn/go-sqlite3/compare/v1.14.32...v1.14.33)

---
updated-dependencies:
- dependency-name: github.com/mattn/go-sqlite3
  dependency-version: 1.14.33
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:44 +00:00
dependabot[bot]
78f1db7ad1 Bump github.com/aws/aws-sdk-go-v2/credentials from 1.19.6 to 1.19.7
Bumps [github.com/aws/aws-sdk-go-v2/credentials](https://github.com/aws/aws-sdk-go-v2) from 1.19.6 to 1.19.7.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/m2/v1.19.6...service/m2/v1.19.7)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/credentials
  dependency-version: 1.19.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:52:40 +00:00
f1367f84f8 Merge branch 'master' into dev 2026-01-12 09:14:45 +01:00
Jan Eitzinger
4c81696f4d Merge pull request #455 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/rollup-4.54.0
Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
2026-01-12 09:09:42 +01:00
Jan Eitzinger
a91f8f72e3 Merge pull request #459 from ClusterCockpit/dependabot/go_modules/golang.org/x/oauth2-0.34.0
Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
2026-01-12 09:09:18 +01:00
Jan Eitzinger
87f7ed329c Merge pull request #461 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/svelte-5.46.1
Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
2026-01-12 09:08:49 +01:00
dependabot[bot]
8641d9053d Bump golang.org/x/oauth2 from 0.32.0 to 0.34.0
Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.32.0 to 0.34.0.
- [Commits](https://github.com/golang/oauth2/compare/v0.32.0...v0.34.0)

---
updated-dependencies:
- dependency-name: golang.org/x/oauth2
  dependency-version: 0.34.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-12 08:07:20 +00:00
Jan Eitzinger
4a5ab8a279 Merge pull request #462 from ClusterCockpit/dependabot/go_modules/github.com/99designs/gqlgen-0.17.85
Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
2026-01-12 09:06:18 +01:00
Jan Eitzinger
d179412ab6 Merge pull request #463 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/service/s3-1.95.0
Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
2026-01-12 09:05:50 +01:00
Jan Eitzinger
968c7d179d Merge pull request #464 from ClusterCockpit/dependabot/go_modules/github.com/go-co-op/gocron/v2-2.19.0
Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
2026-01-12 09:05:29 +01:00
56399523d7 Update module deps 2026-01-12 09:00:06 +01:00
4d6326b8be Remove metricsync 2026-01-12 08:55:31 +01:00
dependabot[bot]
a2414791bf Bump github.com/go-co-op/gocron/v2 from 2.18.2 to 2.19.0
Bumps [github.com/go-co-op/gocron/v2](https://github.com/go-co-op/gocron) from 2.18.2 to 2.19.0.
- [Release notes](https://github.com/go-co-op/gocron/releases)
- [Commits](https://github.com/go-co-op/gocron/compare/v2.18.2...v2.19.0)

---
updated-dependencies:
- dependency-name: github.com/go-co-op/gocron/v2
  dependency-version: 2.19.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:05:04 +00:00
dependabot[bot]
faf3a19f0c Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.90.2 to 1.95.0
Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.90.2 to 1.95.0.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.90.2...service/s3/v1.95.0)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/service/s3
  dependency-version: 1.95.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:04:58 +00:00
dependabot[bot]
4e6038d6c1 Bump github.com/99designs/gqlgen from 0.17.84 to 0.17.85
Bumps [github.com/99designs/gqlgen](https://github.com/99designs/gqlgen) from 0.17.84 to 0.17.85.
- [Release notes](https://github.com/99designs/gqlgen/releases)
- [Changelog](https://github.com/99designs/gqlgen/blob/master/CHANGELOG.md)
- [Commits](https://github.com/99designs/gqlgen/compare/v0.17.84...v0.17.85)

---
updated-dependencies:
- dependency-name: github.com/99designs/gqlgen
  dependency-version: 0.17.85
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:03:41 +00:00
dependabot[bot]
ddc2ecf829 Bump svelte from 5.44.0 to 5.46.1 in /web/frontend
Bumps [svelte](https://github.com/sveltejs/svelte/tree/HEAD/packages/svelte) from 5.44.0 to 5.46.1.
- [Release notes](https://github.com/sveltejs/svelte/releases)
- [Changelog](https://github.com/sveltejs/svelte/blob/main/packages/svelte/CHANGELOG.md)
- [Commits](https://github.com/sveltejs/svelte/commits/svelte@5.46.1/packages/svelte)

---
updated-dependencies:
- dependency-name: svelte
  dependency-version: 5.46.1
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-29 08:02:46 +00:00
ecb5aef735 Fix build error in unit test 2025-12-25 08:48:03 +01:00
11ec2267da Major refactor of metric data handling
- make the  internal memory store required and default
- Rename memorystore to metricstore
- Rename metricDataDispatcher to metricdispatch
- Remove metricdata package
- Introduce metricsync package for upstream metric data pull
2025-12-25 08:42:54 +01:00
8576ae458d Switch to cc-lib v2 2025-12-24 09:24:18 +01:00
Jan Eitzinger
c66445acb5 Merge pull request #458 from ClusterCockpit/dependabot/go_modules/github.com/expr-lang/expr-1.17.7
Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
2025-12-23 21:16:13 +01:00
dependabot[bot]
29a20f7b0b Bump github.com/expr-lang/expr from 1.17.6 to 1.17.7
Bumps [github.com/expr-lang/expr](https://github.com/expr-lang/expr) from 1.17.6 to 1.17.7.
- [Release notes](https://github.com/expr-lang/expr/releases)
- [Commits](https://github.com/expr-lang/expr/compare/v1.17.6...v1.17.7)

---
updated-dependencies:
- dependency-name: github.com/expr-lang/expr
  dependency-version: 1.17.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-23 09:07:01 +00:00
Jan Eitzinger
874c019fb6 Merge pull request #457 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2/config-1.32.6
Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
2025-12-23 10:06:17 +01:00
Jan Eitzinger
54825626de Merge pull request #456 from ClusterCockpit/dependabot/go_modules/github.com/coreos/go-oidc/v3-3.17.0
Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
2025-12-23 10:05:56 +01:00
9bf5c5dc1a Update README and config schema 2025-12-23 09:34:09 +01:00
64fef9774c Add unit test for NATS API 2025-12-23 09:22:57 +01:00
999667ec0c Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 07:56:16 +01:00
c1135531ba Port NATS api to ccMessages 2025-12-23 07:56:13 +01:00
287256e5f1 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-23 05:56:49 +01:00
0bc26aa194 Add error check 2025-12-23 05:56:46 +01:00
Christoph Kluge
502d7e9084 Rework info panel in public dashboard
- change to bootstrap grid from table
- add infos, use badges
- remove non required query
2025-12-22 17:26:56 +01:00
Christoph Kluge
89875db4a9 dashboard layout fixes 2025-12-22 10:39:40 +01:00
dependabot[bot]
5a8b929448 Bump github.com/aws/aws-sdk-go-v2/config from 1.31.20 to 1.32.6
Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.31.20 to 1.32.6.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.31.20...v1.32.6)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2/config
  dependency-version: 1.32.6
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:04:43 +00:00
dependabot[bot]
fe78f2f433 Bump github.com/coreos/go-oidc/v3 from 3.16.0 to 3.17.0
Bumps [github.com/coreos/go-oidc/v3](https://github.com/coreos/go-oidc) from 3.16.0 to 3.17.0.
- [Release notes](https://github.com/coreos/go-oidc/releases)
- [Commits](https://github.com/coreos/go-oidc/compare/v3.16.0...v3.17.0)

---
updated-dependencies:
- dependency-name: github.com/coreos/go-oidc/v3
  dependency-version: 3.17.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:03:31 +00:00
dependabot[bot]
e37591ce6d Bump rollup from 4.53.3 to 4.54.0 in /web/frontend
Bumps [rollup](https://github.com/rollup/rollup) from 4.53.3 to 4.54.0.
- [Release notes](https://github.com/rollup/rollup/releases)
- [Changelog](https://github.com/rollup/rollup/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rollup/rollup/compare/v4.53.3...v4.54.0)

---
updated-dependencies:
- dependency-name: rollup
  dependency-version: 4.54.0
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-22 08:02:41 +00:00
1cd4a57bd3 Remove support for mysql/mariadb 2025-12-20 11:13:41 +01:00
b35172e2f7 Add context information for CLAUDE coding agent 2025-12-20 11:13:02 +01:00
3cfcd30128 Add CLAUDE.md documentation for Claude Code
Provides architecture overview, build commands, and development workflows
to help future Claude Code instances work productively in this codebase.
Includes guidance on GraphQL/REST API patterns, database migrations, and
the repository/metric data architecture.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-20 10:17:54 +01:00
e56532e5c8 Add example json API payloads 2025-12-20 09:35:54 +01:00
fdee4f8938 Integrate NATS API.
Only start either REST start/stop API or NATS start/stop API
2025-12-20 09:21:58 +01:00
Christoph Kluge
7acc89e42d move public dash close button 2025-12-19 17:52:21 +01:00
Christoph Kluge
af7d208c21 remove unused class 2025-12-19 16:16:57 +01:00
Christoph Kluge
91b90d033e fix metric select drag and drop 2025-12-19 15:27:35 +01:00
Christoph Kluge
7a0975b94d final fix render race condition if metrics change in nodeList 2025-12-19 15:10:15 +01:00
Christoph Kluge
c58b01a602 fix wrong render condition order in nodeList 2025-12-19 14:42:02 +01:00
Christoph Kluge
8244449646 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-12-18 15:55:40 +01:00
Christoph Kluge
436afa4a61 fix tag count by including type in grouping 2025-12-18 15:55:30 +01:00
Jan Eitzinger
998f800632 Merge pull request #454 from ClusterCockpit/dev
Dev
2025-12-18 15:52:06 +01:00
06ed056d43 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-18 15:47:53 +01:00
d446c13546 Restore startDemo script 2025-12-18 15:47:51 +01:00
6e74fa294a Add role-based visibility for metrics
Fixes #387
2025-12-18 15:47:30 +01:00
Christoph Kluge
43bdb56072 add fallback case if metric has no name in nodeListRow 2025-12-18 15:04:03 +01:00
Jan Eitzinger
10a0b0add8 Merge pull request #452 from ClusterCockpit/dev
Dev
2025-12-18 11:28:07 +01:00
e707fd0893 Provide fallback in archive manager in case fd is not available 2025-12-18 11:26:05 +01:00
Christoph Kluge
19c8e9beb1 move extensive NodeMetricsList handling to node repo func 2025-12-18 10:44:58 +01:00
Aditya Ujeniya
32e5353847 Fix to NATS deadlock and revert demo script 2025-12-17 18:14:36 +01:00
Aditya Ujeniya
d2f2d78954 Changing JWT output to stdout and change to help text 2025-12-17 15:58:42 +01:00
b8fdfc30c0 Fix performance bugs in sqlite archive backend 2025-12-17 10:12:49 +01:00
79a2ca8ae8 Adapt unit test to new API 2025-12-17 08:44:37 +01:00
d1a78c13a4 Make loglevel info default for demo 2025-12-17 08:38:14 +01:00
f4b00e9de1 Use Info instead of warn loglevel for database file missing msg 2025-12-17 08:38:00 +01:00
0a5e155096 Remove debug setting 2025-12-17 07:03:10 +01:00
4ecc050c4c Fix deadlock if NATS is not configured 2025-12-17 07:03:01 +01:00
88dc5036b3 Make import function interuptible and replace countJobs with external call to fd 2025-12-17 06:32:53 +01:00
d30c6ef3bf Make NATS API subjects configurable 2025-12-17 06:08:09 +01:00
0419fec810 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-17 05:46:10 +01:00
43e5fd1131 Add NATS API backend 2025-12-17 05:44:49 +01:00
Christoph Kluge
11e94124cc improve handling and layout if missing data in dashboard 2025-12-16 15:43:57 +01:00
Christoph Kluge
102109388b link to public dashboard in admin options, add return button do public dashboard 2025-12-16 13:54:17 +01:00
Jan Eitzinger
60a69aa0a2 Merge pull request #453 from ClusterCockpit/status_dashboard
Status dashboard
2025-12-16 10:04:49 +01:00
5e2cbd75fa Review and refactor 2025-12-16 09:45:48 +01:00
14f1192ccb Introduce central nats client 2025-12-16 09:35:33 +01:00
72b2560ecf Add progress bar for import function 2025-12-16 09:11:26 +01:00
7fce6fa401 Parallelize the Iter function in all archive backends 2025-12-16 09:11:09 +01:00
e6286768a7 Refactor variabel naming and update doc comments 2025-12-16 08:56:48 +01:00
0306723307 Introduce transparent compression for importJob function in all archive backends 2025-12-16 08:55:31 +01:00
6f49998ad3 Switch to new go tool pattern for external tool deps 2025-12-16 08:49:17 +01:00
457c944ec6 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-15 21:25:32 +01:00
33c38f9464 Fix start time in tasks 2025-12-15 21:25:30 +01:00
46351389b6 Add ai agent guidelines 2025-12-15 21:25:00 +01:00
Christoph Kluge
d56b0e93db cleanup routes, cleanup root components 2025-12-15 15:10:10 +01:00
Jan Eitzinger
f9aa47ea1c Merge pull request #450 from ClusterCockpit/dev
Dev
2025-12-15 14:42:26 +01:00
d567a5312e Add flag omitTagged to DeleteJobsBefore
Fixes #344
2025-12-15 14:38:46 +01:00
97a322354f Refactor 2025-12-15 14:06:33 +01:00
554527445b Merge branch 'master' into dev 2025-12-15 13:56:41 +01:00
Christoph Kluge
c5aff1a2ca add autorefresh, remove leftover query 2025-12-15 13:55:02 +01:00
987cc40318 Refactor 2025-12-15 13:50:05 +01:00
104fd1576a Refactor 2025-12-15 13:44:50 +01:00
72ce3954b4 feat: Add omitTagged flag for retention services
Fixes #344
2025-12-15 13:44:17 +01:00
cfa7461855 Refactor 2025-12-15 13:25:41 +01:00
44cda8a232 Add flag to obmit tagged jobs from TestFindJobsBetween 2025-12-15 13:25:22 +01:00
cf119e6843 Also initialize job-archive on init flag
Fixes #378
2025-12-15 12:59:12 +01:00
Jan Eitzinger
451744f321 Merge pull request #447 from ClusterCockpit/dependabot/go_modules/github.com/prometheus/common-0.67.4
Bump github.com/prometheus/common from 0.66.1 to 0.67.4
2025-12-15 12:34:17 +01:00
Jan Eitzinger
ca6682b94b Merge pull request #446 from ClusterCockpit/dependabot/go_modules/github.com/aws/aws-sdk-go-v2-1.40.1
Bump github.com/aws/aws-sdk-go-v2 from 1.39.6 to 1.40.1
2025-12-15 12:32:55 +01:00
dependabot[bot]
cbad2341c3 Bump github.com/aws/aws-sdk-go-v2 from 1.39.6 to 1.40.1
Bumps [github.com/aws/aws-sdk-go-v2](https://github.com/aws/aws-sdk-go-v2) from 1.39.6 to 1.40.1.
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases)
- [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json)
- [Commits](https://github.com/aws/aws-sdk-go-v2/compare/v1.39.6...v1.40.1)

---
updated-dependencies:
- dependency-name: github.com/aws/aws-sdk-go-v2
  dependency-version: 1.40.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-15 11:29:02 +00:00
dependabot[bot]
a956c7b135 Bump github.com/prometheus/common from 0.66.1 to 0.67.4
Bumps [github.com/prometheus/common](https://github.com/prometheus/common) from 0.66.1 to 0.67.4.
- [Release notes](https://github.com/prometheus/common/releases)
- [Changelog](https://github.com/prometheus/common/blob/main/CHANGELOG.md)
- [Commits](https://github.com/prometheus/common/compare/v0.66.1...v0.67.4)

---
updated-dependencies:
- dependency-name: github.com/prometheus/common
  dependency-version: 0.67.4
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-15 11:28:21 +00:00
Jan Eitzinger
ea6caeb2f0 Merge pull request #445 from ClusterCockpit/dependabot/go_modules/github.com/golang-migrate/migrate/v4-4.19.1
Bump github.com/golang-migrate/migrate/v4 from 4.18.2 to 4.19.1
2025-12-15 12:27:34 +01:00
Jan Eitzinger
c17e8b1156 Merge pull request #444 from ClusterCockpit/dependabot/go_modules/github.com/linkedin/goavro/v2-2.14.1
Bump github.com/linkedin/goavro/v2 from 2.14.0 to 2.14.1
2025-12-15 12:27:14 +01:00
Jan Eitzinger
b993b1e096 Merge pull request #443 from ClusterCockpit/dependabot/go_modules/github.com/go-co-op/gocron/v2-2.18.2
Bump github.com/go-co-op/gocron/v2 from 2.17.0 to 2.18.2
2025-12-15 12:24:41 +01:00
d7d81e352d Update cc-lib to v1.0.0 and fix bug in init 2025-12-15 12:20:42 +01:00
078c608bda Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-12-15 11:25:44 +01:00
f2e57f9edd Update gitignore 2025-12-15 11:25:42 +01:00
5698d5216f Reformat 2025-12-15 11:24:56 +01:00
10aa2bfbd3 Add support for ClusterConfig 2025-12-15 11:24:12 +01:00
6cfed989ff Fix bugs in sqlite backend 2025-12-15 11:23:53 +01:00
ab70acd582 Also import ClusterConfigs 2025-12-15 11:20:49 +01:00
Christoph Kluge
79e1c236fe cleanup, adapt internalDash, remove debug query value 2025-12-12 17:51:54 +01:00
Aditya Ujeniya
fed62b6c45 Functionality to configure resampling 2025-12-12 14:51:01 +01:00
Christoph Kluge
0d62181272 move roofline elements below series data render 2025-12-12 11:19:37 +01:00
Christoph Kluge
290a71bd48 Merge branch 'dev' into status_dashboard 2025-12-11 18:56:09 +01:00
Christoph Kluge
6e385db378 color roofline plot, add option to match pie and table color for ndoestate 2025-12-11 18:51:19 +01:00
Jan Eitzinger
ffe8329b84 Merge pull request #448 from ClusterCockpit/dev
Dev
2025-12-11 11:27:33 +01:00
f13be109c2 Fix: Replace all Printf log messages with appropriate loglevels 2025-12-11 11:20:11 +01:00
d24d85b970 Adapt tests to new API 2025-12-11 09:39:38 +01:00
8d44ac90ad Fix: Busywait loop in archiver and slow shutdown
Remove unblocking default in select
Add shutdown handler with context and timeout
2025-12-11 09:29:10 +01:00
Christoph Kluge
4083de2a51 Add public dashboard and route, add DoubleMetricPlot and GQL queries
- add roofline legend display switch
- small fixes
2025-12-09 10:26:55 +01:00
dependabot[bot]
131df075db Bump github.com/golang-migrate/migrate/v4 from 4.18.2 to 4.19.1
Bumps [github.com/golang-migrate/migrate/v4](https://github.com/golang-migrate/migrate) from 4.18.2 to 4.19.1.
- [Release notes](https://github.com/golang-migrate/migrate/releases)
- [Commits](https://github.com/golang-migrate/migrate/compare/v4.18.2...v4.19.1)

---
updated-dependencies:
- dependency-name: github.com/golang-migrate/migrate/v4
  dependency-version: 4.19.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:24 +00:00
dependabot[bot]
afd6f50ba2 Bump github.com/linkedin/goavro/v2 from 2.14.0 to 2.14.1
Bumps [github.com/linkedin/goavro/v2](https://github.com/linkedin/goavro) from 2.14.0 to 2.14.1.
- [Release notes](https://github.com/linkedin/goavro/releases)
- [Commits](https://github.com/linkedin/goavro/compare/v2.14.0...v2.14.1)

---
updated-dependencies:
- dependency-name: github.com/linkedin/goavro/v2
  dependency-version: 2.14.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:19 +00:00
dependabot[bot]
ad01366705 Bump github.com/go-co-op/gocron/v2 from 2.17.0 to 2.18.2
Bumps [github.com/go-co-op/gocron/v2](https://github.com/go-co-op/gocron) from 2.17.0 to 2.18.2.
- [Release notes](https://github.com/go-co-op/gocron/releases)
- [Commits](https://github.com/go-co-op/gocron/compare/v2.17.0...v2.18.2)

---
updated-dependencies:
- dependency-name: github.com/go-co-op/gocron/v2
  dependency-version: 2.18.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-08 08:05:11 +00:00
6325793902 Add check in initDB importer if jobMeta is nil 2025-12-04 15:38:21 +01:00
Jan Eitzinger
8ea176f9da Merge pull request #441 from ClusterCockpit/dev
Dev
2025-12-04 15:16:02 +01:00
03b5272e44 Upgrade to latest cc-lib 2025-12-04 15:08:22 +01:00
7da01975f7 archive-migration: Add check for archive version and rewrite version after migration 2025-12-04 15:08:03 +01:00
7cff8bbfd2 Add documentation for importer 2025-12-04 15:07:09 +01:00
Jan Eitzinger
c98cbb33f8 Merge pull request #440 from ClusterCockpit/dev
Dev
2025-12-04 10:46:11 +01:00
f3ea95535b Remove init functionlity from Makefile 2025-12-04 10:24:33 +01:00
b9b84b7971 Use --init flag in startDemo script 2025-12-04 07:43:55 +01:00
be7340ca30 archive-migration leanup and fix path in README.md 2025-12-04 07:43:26 +01:00
881c4566dd Reformat and remove optional ui-config 2025-12-04 07:42:55 +01:00
7efbb0217f Update config for ccbackend --init 2025-12-04 07:42:26 +01:00
9e2ce39cde Update startDemo script 2025-12-04 07:03:01 +01:00
Jan Eitzinger
0ff6cae1c3 Merge pull request #438 from rpabel/master
return directly on error
2025-12-04 06:40:43 +01:00
Jan Eitzinger
d02ba3d717 Merge pull request #439 from ClusterCockpit/dev
Dev
2025-12-04 06:38:16 +01:00
6aa830adb6 Merge dependabot and upgrade dependencies 2025-12-04 06:34:40 +01:00
be6603cbb9 Merge branch 'master' into dev 2025-12-04 06:29:48 +01:00
Jan Eitzinger
8d208929d5 Merge pull request #434 from ClusterCockpit/dependabot/go_modules/github.com/go-ldap/ldap/v3-3.4.12
Bump github.com/go-ldap/ldap/v3 from 3.4.10 to 3.4.12
2025-12-04 06:24:59 +01:00
Jan Eitzinger
cb0f96b737 Merge pull request #433 from ClusterCockpit/dependabot/go_modules/github.com/nats-io/nats.go-1.47.0
Bump github.com/nats-io/nats.go from 1.46.1 to 1.47.0
2025-12-04 06:24:26 +01:00
Jan Eitzinger
83723ab050 Merge pull request #432 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/graphql-16.12.0
Bump graphql from 16.11.0 to 16.12.0 in /web/frontend
2025-12-04 06:23:59 +01:00
Jan Eitzinger
3abaefa550 Merge pull request #430 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/rollup/plugin-commonjs-29.0.0
Bump @rollup/plugin-commonjs from 28.0.3 to 29.0.0 in /web/frontend
2025-12-04 06:23:11 +01:00
Jan Eitzinger
389010dbbd Merge pull request #429 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/chart.js-4.5.1
Bump chart.js from 4.4.9 to 4.5.1 in /web/frontend
2025-12-04 06:22:44 +01:00
81fe2c043e Upgrade to latest cc-lib 2025-12-04 06:21:28 +01:00
c76e9bb3fe Fix error in unit test 2025-12-04 06:20:45 +01:00
48b68d3410 Fix aws endpoint deprecation 2025-12-04 06:20:19 +01:00
2b64b31393 Merge branch 'ai-review' into dev 2025-12-03 15:01:18 +01:00
2333068de7 Add tools for archive manager and archive-migration
Needs testing and review
2025-12-03 14:55:22 +01:00
78530029ef Reformat 2025-12-03 14:54:48 +01:00
329b6e5640 Review refactored code 2025-12-03 14:54:09 +01:00
Christoph Kluge
967f0a3294 remove non required update trigger 2025-11-26 11:00:41 +01:00
Christoph Kluge
6eb779d359 update frontend dependencies, remove deprecated rollup options 2025-11-25 15:49:04 +01:00
Christoph Kluge
414147177a Vacuum not allowed within a migration transaction 2025-11-24 13:30:25 +01:00
Christoph Kluge
3b37f3630c add vacuum statement to migration
- reduces DB size after job table migration
2025-11-24 13:21:14 +01:00
Christoph Kluge
7c1a818582 fix outdated condition in findJobFootprintThresholds 2025-11-21 16:05:47 +01:00
Christoph Kluge
c4cf7e9707 Recreate job table indices after copy, add node table timstamp indices 2025-11-21 13:44:05 +01:00
Christoph Kluge
1ceb681521 fix missed out keyword 2025-11-20 15:58:40 +01:00
Roland Pabel
443176a0d1 return directly on error 2025-11-20 15:09:53 +01:00
Christoph Kluge
261905a364 unify out_of_memory jobState spelling
- underscores used in existing databases
2025-11-20 15:08:22 +01:00
e00288b160 Cleanup dependencies 2025-11-20 14:28:25 +01:00
f141ca926f Increase archive version. Fix unit tests. 2025-11-20 14:28:06 +01:00
f7a0954213 Fix init order. Reformat. 2025-11-20 14:26:27 +01:00
Christoph Kluge
da8d562eba fix hardcoded dev variable 2025-11-20 13:25:09 +01:00
Christoph Kluge
399af8592c switch nodeList logic to SQLite as source of truth, fix nodeList continuous scroll
- keep notindb logic for now
2025-11-20 12:18:13 +01:00
6239e7f19b Merge branch 'dev' into ai-review 2025-11-20 08:59:52 +01:00
d0e1b7186c Add perl script for CC logfile analysis 2025-11-20 07:51:33 +01:00
fea3292f50 Add idea directory to gitignore 2025-11-20 07:50:10 +01:00
9973aa9ffa Refactor api package 2025-11-20 07:48:45 +01:00
0b38a980d2 Port importer to new transaction api 2025-11-20 07:39:16 +01:00
20838b6882 Add documentation to repository package 2025-11-20 07:38:54 +01:00
8f4ef1e274 Refactor repository
Fix issues
Improve transaction API
Make hardcoded constants configurable
Make error messages consistent and always add context info
2025-11-20 06:58:45 +01:00
e1c7583670 Add sqlite and s3 job archive backend
Add documentation
Extend config
2025-11-19 17:00:11 +01:00
39a2157d46 Refactor tagger package
Fix issues
Add documentation
Add unit tests
2025-11-19 16:58:48 +01:00
dd63e7157a Refactor memorystore
Fix issues
Add unit test
Add documentation
2025-11-19 16:58:02 +01:00
340efd7926 Refactor auth package
Fix security issues
Remove redundant code
Add documentation
Add units tests
2025-11-19 16:54:01 +01:00
ecc6194b57 Refactor main package
Fix issues.
Break down main routine.
Add documentation.
Remove globals.
2025-11-19 16:53:04 +01:00
Christoph Kluge
90c3381954 add nodeState info display and filtering to systems views 2025-11-18 15:56:55 +01:00
Christoph Kluge
21334c8026 make active metrics reactive to cluster filter 2025-11-14 14:45:22 +01:00
Christoph Kluge
cbdef6ce9e fix missing rooflines in analysis heatmap plot 2025-11-14 14:02:16 +01:00
Christoph Kluge
591cd9fd66 review analysis view layout, add title with info 2025-11-14 11:28:48 +01:00
Christoph Kluge
e8d2a45afb add allocated cores gauge to status view, fix stacked labels 2025-11-14 10:40:42 +01:00
Christoph Kluge
3b533938a6 review status view components, make node states refreshable 2025-11-13 17:27:41 +01:00
Christoph Kluge
9fe342a7e9 fix metricSelect error if cluster filter is active 2025-11-13 13:40:31 +01:00
Christoph Kluge
2152ced97a improve metricplot threshold handling
- simplified and adaptive thresholds for shared jobs
2025-11-13 11:18:40 +01:00
Christoph Kluge
404be5f317 add optional legend flip to plots 2025-11-12 17:20:50 +01:00
Christoph Kluge
f56783a439 add plot cursor sync to nodelist rows 2025-11-12 16:44:49 +01:00
Christoph Kluge
fb278182d3 add schedulerState resolver 2025-11-12 13:50:09 +01:00
Christoph Kluge
c2c63d2f67 fix backend node queries
- wrong table name
- wrong scan field count: timestamp catch for log
2025-11-12 13:38:58 +01:00
Christoph Kluge
7f740455fe fix old gql field name 2025-11-12 13:09:31 +01:00
Christoph Kluge
946b992746 fix leftover dev variable 2025-11-12 12:46:00 +01:00
Christoph Kluge
a6c43e6f2f finalize timed node state frontend code for status view 2025-11-11 17:03:59 +01:00
Christoph Kluge
ecad52c18d fix: fix defautl time range select values 2025-11-06 11:15:11 +01:00
Christoph Kluge
e49e5a0474 finalize timed node state backend code, concat functions 2025-11-05 18:17:29 +01:00
dependabot[bot]
9231b3cfca Bump github.com/go-ldap/ldap/v3 from 3.4.10 to 3.4.12
Bumps [github.com/go-ldap/ldap/v3](https://github.com/go-ldap/ldap) from 3.4.10 to 3.4.12.
- [Release notes](https://github.com/go-ldap/ldap/releases)
- [Commits](https://github.com/go-ldap/ldap/compare/v3.4.10...v3.4.12)

---
updated-dependencies:
- dependency-name: github.com/go-ldap/ldap/v3
  dependency-version: 3.4.12
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:08:39 +00:00
dependabot[bot]
68e0159292 Bump github.com/nats-io/nats.go from 1.46.1 to 1.47.0
Bumps [github.com/nats-io/nats.go](https://github.com/nats-io/nats.go) from 1.46.1 to 1.47.0.
- [Release notes](https://github.com/nats-io/nats.go/releases)
- [Commits](https://github.com/nats-io/nats.go/compare/v1.46.1...v1.47.0)

---
updated-dependencies:
- dependency-name: github.com/nats-io/nats.go
  dependency-version: 1.47.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:08:35 +00:00
dependabot[bot]
1a674590bf Bump graphql from 16.11.0 to 16.12.0 in /web/frontend
Bumps [graphql](https://github.com/graphql/graphql-js) from 16.11.0 to 16.12.0.
- [Release notes](https://github.com/graphql/graphql-js/releases)
- [Commits](https://github.com/graphql/graphql-js/compare/v16.11.0...v16.12.0)

---
updated-dependencies:
- dependency-name: graphql
  dependency-version: 16.12.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:48 +00:00
dependabot[bot]
1ef47e7b3f Bump @rollup/plugin-commonjs from 28.0.3 to 29.0.0 in /web/frontend
Bumps [@rollup/plugin-commonjs](https://github.com/rollup/plugins/tree/HEAD/packages/commonjs) from 28.0.3 to 29.0.0.
- [Changelog](https://github.com/rollup/plugins/blob/master/packages/commonjs/CHANGELOG.md)
- [Commits](https://github.com/rollup/plugins/commits/commonjs-v29.0.0/packages/commonjs)

---
updated-dependencies:
- dependency-name: "@rollup/plugin-commonjs"
  dependency-version: 29.0.0
  dependency-type: direct:development
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:36 +00:00
dependabot[bot]
214a2762df Bump chart.js from 4.4.9 to 4.5.1 in /web/frontend
Bumps [chart.js](https://github.com/chartjs/Chart.js) from 4.4.9 to 4.5.1.
- [Release notes](https://github.com/chartjs/Chart.js/releases)
- [Commits](https://github.com/chartjs/Chart.js/compare/v4.4.9...v4.5.1)

---
updated-dependencies:
- dependency-name: chart.js
  dependency-version: 4.5.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-11-03 08:03:30 +00:00
cb5d06decd Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-29 09:01:05 +01:00
8555a88202 Upgrade dependencies. Regenerate GraphQL API. 2025-10-29 08:55:06 +01:00
Michael Panzlaff
2287f4493a Reapply "Fix wrong memorystore nats schema"
This reverts commit ea7660ddb3.
2025-10-28 13:17:00 +01:00
Jan Eitzinger
bb357f7cab Merge pull request #420 from ClusterCockpit/dependabot/go_modules/github.com/99designs/gqlgen-0.17.81
Bump github.com/99designs/gqlgen from 0.17.66 to 0.17.81
2025-10-28 12:19:23 +01:00
dependabot[bot]
d9b240cd2d Bump github.com/99designs/gqlgen from 0.17.66 to 0.17.81
Bumps [github.com/99designs/gqlgen](https://github.com/99designs/gqlgen) from 0.17.66 to 0.17.81.
- [Release notes](https://github.com/99designs/gqlgen/releases)
- [Changelog](https://github.com/99designs/gqlgen/blob/master/CHANGELOG.md)
- [Commits](https://github.com/99designs/gqlgen/compare/v0.17.66...v0.17.81)

---
updated-dependencies:
- dependency-name: github.com/99designs/gqlgen
  dependency-version: 0.17.81
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:17:27 +00:00
Jan Eitzinger
bea5ee96d9 Merge pull request #422 from ClusterCockpit/dependabot/go_modules/github.com/coreos/go-oidc/v3-3.16.0
Bump github.com/coreos/go-oidc/v3 from 3.12.0 to 3.16.0
2025-10-28 12:16:36 +01:00
Jan Eitzinger
7d205fd526 Merge pull request #423 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/mathjs-15.0.0
Bump mathjs from 12.4.3 to 15.0.0 in /web/frontend
2025-10-28 12:16:08 +01:00
Jan Eitzinger
c15b2a0cbb Merge pull request #424 from ClusterCockpit/dependabot/go_modules/golang.org/x/oauth2-0.32.0
Bump golang.org/x/oauth2 from 0.27.0 to 0.32.0
2025-10-28 12:15:35 +01:00
dependabot[bot]
7ccba30a3d Bump mathjs from 12.4.3 to 15.0.0 in /web/frontend
Bumps [mathjs](https://github.com/josdejong/mathjs) from 12.4.3 to 15.0.0.
- [Changelog](https://github.com/josdejong/mathjs/blob/develop/HISTORY.md)
- [Commits](https://github.com/josdejong/mathjs/compare/v12.4.3...v15.0.0)

---
updated-dependencies:
- dependency-name: mathjs
  dependency-version: 15.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:13:03 +00:00
dependabot[bot]
8091485588 Bump golang.org/x/oauth2 from 0.27.0 to 0.32.0
Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.27.0 to 0.32.0.
- [Commits](https://github.com/golang/oauth2/compare/v0.27.0...v0.32.0)

---
updated-dependencies:
- dependency-name: golang.org/x/oauth2
  dependency-version: 0.32.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:12:58 +00:00
dependabot[bot]
1413f968d6 Bump github.com/coreos/go-oidc/v3 from 3.12.0 to 3.16.0
Bumps [github.com/coreos/go-oidc/v3](https://github.com/coreos/go-oidc) from 3.12.0 to 3.16.0.
- [Release notes](https://github.com/coreos/go-oidc/releases)
- [Commits](https://github.com/coreos/go-oidc/compare/v3.12.0...v3.16.0)

---
updated-dependencies:
- dependency-name: github.com/coreos/go-oidc/v3
  dependency-version: 3.16.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-28 11:12:55 +00:00
Jan Eitzinger
d1d1bb09e9 Merge pull request #427 from ClusterCockpit/dev
Pre-Merge 1.5 dev
2025-10-28 12:11:49 +01:00
Aditya Ujeniya
3c1a7e0171 Fixed the behavior of avro write to old files 2025-10-28 09:42:28 +01:00
Jan Eitzinger
0cb50f2f01 Add warning for master branch usability
Added warning about the master branch not being production-ready.
2025-10-28 09:11:23 +01:00
Aditya Ujeniya
2287586700 Revert avro files writing logic 2025-10-28 08:53:43 +01:00
Aditya Ujeniya
ea7660ddb3 Revert "Fix wrong memorystore nats schema"
This reverts commit 856ccbb969.
2025-10-28 08:50:33 +01:00
Aditya Ujeniya
44e98e8f2f Fix to avro reader 2025-10-27 20:44:40 +01:00
Michael Panzlaff
856ccbb969 Fix wrong memorystore nats schema 2025-10-27 14:53:18 +01:00
Aditya Ujeniya
0920286b4c Clean up 2025-10-23 17:58:56 +02:00
Aditya Ujeniya
f34e10cfd9 Schema for metric store 2025-10-23 17:58:17 +02:00
ae5d202661 Remove S3Backend stub 2025-10-23 15:14:28 +02:00
bc43c844fc Fix memoryStore Init and move MetricConfig init 2025-10-20 10:22:40 +02:00
67be9aa27b Refactor
Port logging to cclog, use loglevels
Separate REST API from pkg API
2025-10-19 09:33:40 +02:00
047b997a22 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-18 08:30:47 +02:00
bac51891b7 Move avro into memorystore. Refactor
Does not compile
2025-10-18 08:30:42 +02:00
Christoph Kluge
714d6af7cd initial branch commit, improve countstate backend logic
- stacked component rough sketch
- gql data request pipeline layed out
2025-10-17 18:24:05 +02:00
6efd6334bb Fix unit tests 2025-10-17 07:06:31 +02:00
91f4475d76 Update test db 2025-10-17 07:05:45 +02:00
Christoph Kluge
de309784b4 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-10-16 15:33:59 +02:00
Christoph Kluge
a623cf53f3 revert leftover notfoundhandler experiments 2025-10-16 15:33:56 +02:00
440cd59e50 Revert hpc_cluster to cluster. Refactor. 2025-10-16 14:32:06 +02:00
eefb6f6265 Cleanup after merge 2025-10-16 13:21:22 +02:00
f5e1226837 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-16 13:01:35 +02:00
151f7e701f Disable userConfig unit test 2025-10-16 12:54:29 +02:00
40398497c2 Update Node table code. Add simple unit test 2025-10-16 12:54:16 +02:00
Christoph Kluge
cda10788fb adapt migrated indices to new database structure, include node tables, update job indices 2025-10-15 10:46:24 +02:00
Christoph Kluge
845905d9c8 remove inspect commands for dev 2025-10-15 10:35:35 +02:00
89055506d6 Revert changes to ui config init 2025-10-15 08:54:16 +02:00
Christoph Kluge
5908ae7905 adapt status node query resolution to new node_state table 2025-10-14 18:45:05 +02:00
Christoph Kluge
4131665284 remove gql auto comment 2025-10-14 18:43:16 +02:00
Christoph Kluge
6a43dfb0d7 Fix missing model.Aggregate entry, fix status queries and refresh 2025-10-14 18:43:00 +02:00
3d38d78845 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-10-13 16:13:53 +02:00
600f19ac80 Sync commit
Does not work yet
2025-10-13 16:12:02 +02:00
Christoph Kluge
0a3a664653 small fixes, set analysisView config defaults 2025-10-09 16:33:14 +02:00
Christoph Kluge
471ec1cd2e change deprecated defaultMetrics loader to new confkey
- see PR #333
2025-10-08 18:24:41 +02:00
Christoph Kluge
e296cd7ca0 add web init with uiconfig file path, add cli flag 2025-10-08 16:25:50 +02:00
Christoph Kluge
31cfa8cd7c fix typo for tagEditDisplay 2025-10-08 12:58:02 +02:00
Christoph Kluge
70fe8aa367 fix systemsView config laod and mutation, fix metricSelection checked 2025-10-07 15:46:16 +02:00
Christoph Kluge
cc9dafac6f fix sq.Update call 2025-10-02 18:10:58 +02:00
Christoph Kluge
32429f1481 adapt frontend for new uiConfig keys, add nodeOverview mutation 2025-10-02 18:10:33 +02:00
9485a463b8 Refactor node repository 2025-09-30 10:07:07 +02:00
35c6ab4a08 Ongoing work on node table
Sync commit: Does not compile
2025-09-30 10:06:19 +02:00
e58b0fa015 Add ui config tests and fix bugs 2025-09-30 09:01:54 +02:00
beb92967e5 Update nodestate API and db adapter 2025-09-28 08:26:44 +02:00
015583f1cd Add incremental configuration 2025-09-28 08:26:18 +02:00
d40c54b802 Refactor 2025-09-28 08:24:41 +02:00
647665b6b9 Refactor 2025-09-28 08:24:12 +02:00
4fc78bc382 Refactor variable namings and doc comments 2025-09-27 09:27:36 +02:00
50d000e7e2 Implement UI config handling 2025-09-27 09:26:42 +02:00
Jan Eitzinger
ad500c4bef Merge pull request #416 from ClusterCockpit/add_uiconfig_schema
Add uiconfig schema
2025-09-26 13:38:25 +02:00
Jan Eitzinger
916077c6f8 Merge branch 'dev' into add_uiconfig_schema 2025-09-26 13:27:18 +02:00
Christoph Kluge
935fb238a4 add init context to nodeOverview, add additional key for plot rerender 2025-09-10 18:01:33 +02:00
Christoph Kluge
d03e5b4562 handle metric disabled state explicitly in nodeOverview component 2025-09-10 15:42:13 +02:00
Christoph Kluge
05c45c6468 fix: add missing kes to node overview, solves load to empty overview 2025-09-10 15:31:39 +02:00
9020613a8b Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-09-10 15:14:40 +02:00
be92d5943d Decrease verbosity in jobcache sync task 2025-09-10 15:13:53 +02:00
Aditya Ujeniya
b2368a0751 Connectivity to CCMS feature readded 2025-09-10 14:23:18 +02:00
7948d5f773 Use different demo job-archive for dev branch 2025-09-10 10:34:11 +02:00
Jan Eitzinger
1a16851ad0 Merge pull request #401 from ClusterCockpit/dependabot
Add Dependabot for version updates
2025-09-10 09:18:13 +02:00
Jan Eitzinger
810c14a839 Merge pull request #405 from ClusterCockpit/metricstore
Metricstore Integration
2025-09-10 09:17:33 +02:00
Jan Eitzinger
df0e8eb228 Merge branch 'dev' into metricstore 2025-09-10 09:14:50 +02:00
79605c8a9e Update test pipeline to go 1.25 2025-09-10 09:08:32 +02:00
Aditya Ujeniya
9b644119ae Fix to testdata database 2025-09-09 18:34:10 +02:00
Christoph Kluge
ffa9919019 Merge pull request #403 from ClusterCockpit/rework_status_view
Rework status view

As discussed in office: Tests will be fixed in dev branch
2025-09-09 17:37:26 +02:00
55ca892f90 Merge branch 'metricstore' of github.com:ClusterCockpit/cc-backend into metricstore 2025-09-09 15:04:33 +02:00
eaca187032 Fix testdata for new schema 2025-09-09 15:04:25 +02:00
Aditya Ujeniya
3b9d05cc6d Fix exclusive to shared in svlete and graphql 2025-09-09 14:57:05 +02:00
d00881de2e Refactor and update dependencies 2025-09-09 11:36:02 +02:00
d8e85cf75d Fix migration 2025-09-09 11:35:34 +02:00
39f21763e4 Revert test database 2025-09-09 11:30:20 +02:00
Aditya Ujeniya
af43901ca3 Trial and Test MetricStore components 2025-09-08 22:54:13 +02:00
Aditya Ujeniya
62565b9ae2 Combined metricstore api and functions 2025-09-08 11:29:27 +02:00
Aditya Ujeniya
bca176170c Migration SQL fix 2025-09-03 08:22:15 +02:00
Christoph Kluge
2a91ca0cff Merge branch 'dev' into rework_status_view 2025-08-13 14:29:19 +02:00
Christoph Kluge
19a75554b0 remove outdated components 2025-08-13 14:23:19 +02:00
Christoph Kluge
58ae476a3e move and add interface options for status tabs 2025-08-13 14:22:24 +02:00
Christoph Kluge
44d8254a0b fix layouting 2025-08-12 17:57:04 +02:00
Christoph Kluge
bd2cdfcef2 reorganize plots, reduce tabs, 2025-08-12 17:04:31 +02:00
a50b832c2a Import metric store packages 2025-08-08 14:24:52 +02:00
Christoph Kluge
10194105e3 fix color overflow, add info if no status data 2025-08-08 13:50:09 +02:00
Christoph Kluge
b474288df7 add cbmode to piecharts
- old default colorscheme is now cb colorscheme
2025-08-07 18:20:34 +02:00
Christoph Kluge
f338209f32 rename new roofline compnent 2025-08-07 16:28:35 +02:00
Christoph Kluge
bef832e45b Build new statusDash, refine newRoofline data render 2025-08-07 16:10:11 +02:00
Christoph Kluge
71cfb4db77 fix: fix metric availability subcluster list overflow 2025-08-05 14:19:03 +02:00
86453e7e11 Port to new job structs
Backup commit: Does not build.
2025-08-05 10:23:54 +02:00
Christoph Kluge
98b9f8e62d Add more information to status dash 2025-08-04 14:50:53 +02:00
44cd8d258d Fix and regenerate Swagger and GraphQL 2025-07-31 12:10:46 +02:00
764b65d094 Add timestamp column to node table 2025-07-31 12:10:01 +02:00
Christoph Kluge
4d2c64b012 remove logging 2025-07-23 15:00:10 +02:00
Christoph Kluge
35c0b0be58 add scheduler and health status pie charts 2025-07-21 16:03:07 +02:00
Christoph Kluge
7a54e2cfb3 add required and minItems flags to uiConfigSchema 2025-07-21 11:37:05 +02:00
Christoph Kluge
54283f6d3c add schema definition for uiConfig 2025-07-21 11:21:54 +02:00
Christoph Kluge
697acd1d88 Extend bubbleRoofline for nodeData, add column to node table, rename nodeStats query 2025-07-18 18:12:07 +02:00
Christoph Kluge
5cdb80b4d6 cleanup intends, add transparency switch to path renderer 2025-07-15 18:49:23 +02:00
Christoph Kluge
e48ff8be73 change bubble render parameters
- Note: data points are hover highlighted by tooltip
2025-07-15 16:36:12 +02:00
Christoph Kluge
096217eea6 cleanup bubbleRoofline code, comment optional code parts 2025-07-15 16:00:55 +02:00
Christoph Kluge
ed5290be86 adds new roofline component for job average based data
- clickable, resource sized and duration colored bubbles
2025-07-14 18:12:34 +02:00
Christoph Kluge
b036c3903c add config fallbacks and notes 2025-07-10 14:57:12 +02:00
Christoph Kluge
57b43b7b60 Split status view into tabbed components 2025-07-07 18:44:24 +02:00
ab1ddb7bd1 Refactor 2025-07-07 14:29:06 +02:00
881f2f32f4 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-07-07 13:09:16 +02:00
0754ba5292 Port configuration to ccConfig scheme
Decentralize config validation
Modularize configuration handling
2025-07-07 13:09:12 +02:00
Christoph Kluge
743a89c3a2 Finalize node query backend functions, fix migration issue 2025-07-04 15:14:15 +02:00
Christoph Kluge
6692c3ab7c add indices for new node table and tags 2025-07-03 15:07:05 +02:00
Thomas Gruber
c16a5fdac4 Create dependabot.yml 2025-07-03 14:46:04 +02:00
Christoph Kluge
60ec7e54f5 Update component header, format, streamline SV5 components 2025-07-02 18:43:25 +02:00
dd48f5ab87 fix: Optimize sqlite settings 2025-07-02 09:12:07 +02:00
Christoph Kluge
db674ec31d Migrate RooflineHM and Scatter components
- With this commit, all SV4 components are migrated to SV5
2025-07-01 18:05:53 +02:00
Christoph Kluge
48150ffc8b Migrate Pie and Polar components 2025-07-01 17:25:52 +02:00
Christoph Kluge
1ad80efab6 Migrate Histogram and Roofline components 2025-07-01 16:33:07 +02:00
Christoph Kluge
aa8789f8f8 Migrate MetricPlot component 2025-07-01 15:50:45 +02:00
Christoph Kluge
56e3f2da5c Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-07-01 15:48:39 +02:00
Christoph Kluge
a4104822e2 format cleanup component arguments 2025-07-01 15:48:30 +02:00
Jan Eitzinger
c13f386e3b Merge pull request #399 from ClusterCockpit/port-to-cclib
Port to cclib
2025-06-30 13:22:22 +02:00
4bd73450b5 Temporary disable archive clean test 2025-06-30 13:00:07 +02:00
64da28e814 Merge branch 'dev' into port-to-cclib 2025-06-30 12:09:28 +02:00
639e1b9c6d Port to cc-lib. Extend legal header. 2025-06-30 12:06:35 +02:00
Christoph Kluge
63e828d2df Commentout dev logging 2025-06-27 18:49:19 +02:00
Christoph Kluge
b8c30b5703 Fix continuous scroll in sv5 joblist, rework joblist logic 2025-06-27 18:42:18 +02:00
Christoph Kluge
805ea91fc2 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-06-27 15:52:57 +02:00
Christoph Kluge
c4c422da57 Migrate jobList and jobListRow 2025-06-27 15:52:54 +02:00
544fb35121 Merge branch 'dev' into port-to-cclib 2025-06-27 14:15:38 +02:00
43edccb284 Add enable jobtagger options. Reformat. 2025-06-27 14:11:37 +02:00
7531ba4b5c Refine app detection
Switch to regexp
2025-06-27 14:11:10 +02:00
983aa592d8 refine highload rule 2025-06-27 12:16:17 +02:00
8378784231 Enclose terms by spaces in app detection 2025-06-27 12:16:06 +02:00
dca25cc601 Saveguard changes to archive 2025-06-27 12:15:42 +02:00
Christoph Kluge
c8fe81cd80 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-06-27 11:43:53 +02:00
c0a4724f57 Add matlab app type and fix tests 2025-06-27 11:31:43 +02:00
484c52d813 Also update job archive on tag deletion 2025-06-27 11:20:22 +02:00
Christoph Kluge
47843b2087 Optimize jobview gql query load 2025-06-27 11:15:17 +02:00
Christoph Kluge
c3a6126799 Migrate and rework job view metricplot wrapper 2025-06-26 18:41:27 +02:00
Christoph Kluge
e94b250541 Migrate nodeList subcomponents 2025-06-26 12:29:48 +02:00
Christoph Kluge
db5f6c7540 Migrate plotgrid, adapt parent components with new snippets 2025-06-25 18:19:24 +02:00
Christoph Kluge
79a6c9e90d Migrate Job View 2025-06-25 17:41:11 +02:00
e2e67e3977 Merge branch 'migrate_svelte5' into dev 2025-06-24 06:53:18 +02:00
6c06450701 Add more tagger rules 2025-06-24 06:52:21 +02:00
Christoph Kluge
d7379a1af2 Migrate jobView components 2025-06-20 18:14:36 +02:00
Christoph Kluge
d731611e0c Migrate single node view, fix route condition 2025-06-20 17:47:06 +02:00
Christoph Kluge
dceb92ba8e Migrate jobCompare and comparison plot 2025-06-20 15:20:26 +02:00
Christoph Kluge
1e039cb1bf Migrate select components and adapt parents 2025-06-18 18:14:56 +02:00
6f3e1ffbe3 Add ressource ounts to node table 2025-06-18 13:02:11 +02:00
Christoph Kluge
6a6dca3fce Migrate config, migrate analysis plotselection 2025-06-16 17:09:02 +02:00
Christoph Kluge
d6d92071bf fix: remove unnecessary bind, correct page item minimum 2025-06-16 13:04:33 +02:00
Christoph Kluge
d40657dc64 Migrate pagination and jobinfo 2025-06-13 17:05:07 +02:00
Christoph Kluge
6dde2a1e59 Migrate JobSummary and subcomponents 2025-06-13 15:49:51 +02:00
Christoph Kluge
b7823cec16 Migrate header components 2025-06-13 14:46:09 +02:00
Christoph Kluge
eabd7b8d51 Remove unused component 2025-06-13 14:40:07 +02:00
Christoph Kluge
27ec445e54 Small migrations and added migration note 2025-06-13 14:39:55 +02:00
Christoph Kluge
ad108b285f fix continuous scroll next page logic error 2025-06-12 17:20:22 +02:00
Christoph Kluge
f471214ef7 migrate system view, node list and node overview 2025-06-12 16:23:31 +02:00
Christoph Kluge
a0190f8f40 Merge branch 'dev' into migrate_svelte5 2025-06-10 10:02:58 +02:00
82af984023 Implement part of Node query GraphQL callbacks 2025-06-06 17:32:09 +02:00
0373010497 Refactor and fix tagger test 2025-06-06 16:41:48 +02:00
Christoph Kluge
c22d869aa7 Move form to cardbody instead of classing 2025-06-06 16:17:42 +02:00
87c93e90cd Implement node query 2025-06-06 16:04:53 +02:00
3d6dca9386 Add more apps for tagger 2025-06-06 16:04:37 +02:00
Christoph Kluge
f946e7e6ab fix: fix issues after updated dev branch merge 2025-06-06 13:43:13 +02:00
Christoph Kluge
d50dfa5867 Update frontend dependencies: rollup and svelte 2025-06-06 11:14:37 +02:00
249128e011 Cleanup. Re-generate Swagger 2025-06-06 06:30:40 +02:00
ca16a80b1f Add info logging to node repo 2025-06-06 06:12:02 +02:00
Christoph Kluge
e789e7ba9b fix missing state declarations 2025-06-05 18:08:16 +02:00
Christoph Kluge
5048f7be14 Merge branch 'dev' into migrate_svelte5 2025-06-05 17:56:48 +02:00
Christoph Kluge
0e3603f596 fix: layout issues in jobList toolbar 2025-06-05 17:47:03 +02:00
9cd4b3c1cc Convert to all lower case 2025-06-05 16:20:48 +02:00
1d9aa75960 Add determine nodestate routine 2025-06-05 16:15:40 +02:00
Christoph Kluge
0a24ef70e0 fix: fix joblist continuous scroll buildup when refreshing 2025-06-05 15:19:00 +02:00
3b5d3d671e Refactor 2025-06-05 14:27:26 +02:00
7db83d216e Start implementing nodestate rest api 2025-06-05 14:27:21 +02:00
d1a7002422 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-06-05 13:23:39 +02:00
1d8e7e072f Refactor rest api 2025-06-05 13:23:36 +02:00
7466fe7a34 Update GraphQL schema. Refactor node repository 2025-06-05 13:17:24 +02:00
Christoph Kluge
24cf5047da Migrate tags view 2025-06-05 10:51:32 +02:00
Christoph Kluge
1f103e5ef5 Migrate status view 2025-06-05 10:43:44 +02:00
Christoph Kluge
9e87974eb1 Fix compareTable sorting 2025-06-04 17:02:02 +02:00
Christoph Kluge
d806cf76c4 Fix config warning and key name 2025-06-04 16:23:54 +02:00
Christoph Kluge
6e2703998d Migrate jobTag management 2025-06-04 14:45:31 +02:00
6f9737c2c2 Add node repository, extend GraphQL API
Sync commit.
2025-06-04 13:44:37 +02:00
Christoph Kluge
5e696c10d5 Migrate job view stats table 2025-06-04 11:28:45 +02:00
Christoph Kluge
927e25c72c Migrate metricSelection 2025-06-03 13:32:14 +02:00
8b1b99ba35 feat: Add requested memory to job meta data
Fixes #110
2025-06-03 07:16:19 +02:00
2c102cd1ff Fix error in node table migration 2025-06-03 06:55:49 +02:00
Christoph Kluge
42c4926c47 fix refresher sv5 logic 2025-06-02 14:20:32 +02:00
Christoph Kluge
703556d893 Migrate user list and analysis view 2025-06-02 13:51:15 +02:00
Christoph Kluge
0b529a5c3c Migrate and fix filter component and subcomponents 2025-06-02 13:00:47 +02:00
Jan Eitzinger
5186b3f61e Merge pull request #398 from ClusterCockpit/Refactor-job-struct
Refactor job struct
2025-06-02 12:13:43 +02:00
4dc0da5099 Add node table schema 2025-06-02 12:07:01 +02:00
1bad6ba065 Regenerate GraphQL interface 2025-05-28 16:00:47 +02:00
3efee22536 Remove jobMeta and use job struct everywhere 2025-05-28 15:59:21 +02:00
eef48ac3a3 Small fix in highload rule 2025-05-28 14:33:52 +02:00
e35cfbc3dd Refactor 2025-05-28 14:32:56 +02:00
4a5fd96b32 Adapt job class rules 2025-05-28 14:32:49 +02:00
Jan Eitzinger
bdffe73f59 Merge pull request #397 from ClusterCockpit/134-job-tagging
134 job tagging
2025-05-27 13:14:50 +02:00
cdfe722457 Include metric thresholds in rule environment
Not yet tested
2025-05-27 13:02:13 +02:00
0aecea6de2 Refactor. Add Subcluster get metric list helper routine. 2025-05-27 09:23:28 +02:00
5a88c77171 Remove debug output 2025-05-26 14:42:41 +02:00
8003217092 Add string to gromacs app file 2025-05-26 14:41:02 +02:00
9b325041c1 Fix typo in jobCache columns 2025-05-26 14:30:30 +02:00
1e7fbe5d56 Refactor 2025-05-26 13:40:34 +02:00
0261c263f9 Add hint message only if rule matches 2025-05-26 13:36:23 +02:00
8d6ae85b0d Fix bug with job columns 2025-05-26 13:26:18 +02:00
f14bdb3068 Fix bugs in job classifier and tagger infrastructure 2025-05-26 13:08:03 +02:00
3c66840f95 Add tagger config option and command line switch to run taggers on all jobs 2025-05-23 10:13:59 +02:00
733e3ea9d5 Revert interface from jobMeta to job type. Extend job classifier tagger. Cleanup test rules. 2025-05-23 07:48:27 +02:00
ca634bb707 Refactor taggers. Refine Job Hooks. Start job classifier 2025-05-22 07:10:41 +02:00
9abc206d1a Read in tagger config on startup. Safeguard watcher shutdown 2025-05-20 07:10:15 +02:00
85f17c0fd8 Refactor Tagger package. Add fsNotify Service 2025-05-19 16:08:43 +02:00
14bad81b9f Extend Job Hooks and add unit tests
Add job tagger control
2025-05-19 13:25:39 +02:00
Christoph Kluge
ffd596e2c7 Migrate job list view and filter components
- filters now inactive in user jobs, lists and analysis due to missing dispatch
2025-05-19 09:25:23 +02:00
99f8187092 Port tests to new architecture 2025-05-19 09:17:16 +02:00
f30b784f45 Attempt to fix api test
Tests still fail
2025-05-16 17:38:00 +02:00
f06b5f8fc0 Refactor 2025-05-16 17:37:36 +02:00
2e781b900d Staged error handling for job cache 2025-05-16 17:37:24 +02:00
d76b1ae75d feat: add job commit service
Sync jobs from job cache table to main job table.
Enables #392
2025-05-16 17:36:33 +02:00
40110580e0 feat: add job hook support
Fixes #394
2025-05-16 17:33:44 +02:00
eab7961a83 Introduce caching table for faster job inserts
Fixes #392
2025-05-16 17:32:19 +02:00
432e06e801 Add GoString method for jobmeta 2025-05-16 17:19:56 +02:00
fe1ff5c7a3 Update tests from dev 2025-05-16 07:33:33 +02:00
6e66b8e08b Merge branch 'dev' into 134-job-tagging 2025-05-16 07:26:00 +02:00
7abdd0545e Add api for tag handling within cc-backend 2025-05-16 07:24:24 +02:00
Christoph Kluge
3f1768e467 Merge branch 'dev' into migrate_svelte5 2025-05-14 17:06:30 +02:00
Christoph Kluge
f464921ae3 fix: fix user view filter job count 2025-05-14 17:05:58 +02:00
Christoph Kluge
7603ad3fb0 Polish and Format rollup config for svelte5 2025-05-14 11:41:11 +02:00
Christoph Kluge
be7ccc78b8 Update packages, ignore sveltestrap related warnings on compile 2025-05-14 11:02:48 +02:00
Christoph Kluge
b3135c982f Merge latest state branch 'dev' into migrate_svelte5 2025-05-13 18:25:54 +02:00
13386175f5 Merge branch 'dev' into 134-job-tagging 2025-05-13 14:48:58 +02:00
23e8f3dc2d Port to godotenv library
Fixes #376
2025-05-13 14:46:01 +02:00
Jan Eitzinger
b323ce2eef Merge pull request #391 from ClusterCockpit/add_job_comparison
Add job comparison
2025-05-13 14:18:22 +02:00
Jan Eitzinger
08e323ba51 Merge pull request #390 from ClusterCockpit/dependabot/go_modules/golang.org/x/net-0.38.0
Bump golang.org/x/net from 0.36.0 to 0.38.0
2025-05-13 14:12:44 +02:00
dependabot[bot]
9f50f36b1d Bump golang.org/x/net from 0.36.0 to 0.38.0
Bumps [golang.org/x/net](https://github.com/golang/net) from 0.36.0 to 0.38.0.
- [Commits](https://github.com/golang/net/compare/v0.36.0...v0.38.0)

---
updated-dependencies:
- dependency-name: golang.org/x/net
  dependency-version: 0.38.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-13 12:10:40 +00:00
Christoph Kluge
4399c1d590 Add metric units to compareTable head, format metric subheads 2025-05-12 11:39:45 +02:00
Christoph Kluge
f7376f6dca Reduce tick spacing in comparePlots 2025-05-09 17:56:20 +02:00
Christoph Kluge
518cb34340 Add sorting to compareTable 2025-05-09 17:07:39 +02:00
Christoph Kluge
f210a5f508 Remove refresh and textfilter from compareView, force id filters only on compare 2025-05-09 14:36:54 +02:00
Christoph Kluge
9ebc49dd1c add table to compareview, remove debug data view 2025-05-08 15:21:05 +02:00
Christoph Kluge
c119eeb468 Prevent high job counts in compare view by filter removal 2025-05-08 11:28:13 +02:00
Christoph Kluge
ab616f8f79 Fix JobCompare Labelling and Rerender 2025-05-08 10:48:30 +02:00
Christoph Kluge
69286881e4 add manual job selection for comparison in jobs view 2025-05-08 09:28:48 +02:00
Christoph Kluge
4419df8d1b add cluster and subcluster information to compareplot 2025-05-06 18:08:35 +02:00
Christoph Kluge
aed2bd48fc add resource compare graph, add cursor sync, handle jobIds fitler 2025-05-06 17:54:13 +02:00
Christoph Kluge
d3d752f90c finalize compareplot prototype, move formattime to units.js 2025-05-06 10:46:30 +02:00
Christoph Kluge
33ecfe88ef add job duration, add starttime and duration to legend 2025-05-06 09:58:28 +02:00
Christoph Kluge
fd52fdd35b add job starttime to legend 2025-05-05 16:41:05 +02:00
Christoph Kluge
1d13d3dccf add and integrate job comparison plot component 2025-05-05 11:26:39 +02:00
Christoph Kluge
1c84bcae35 add filterBuffer for seamless view switch 2025-04-29 18:40:44 +02:00
Christoph Kluge
df497d5952 initial branch commit, add job compare switch, add gql resolver 2025-04-29 15:10:06 +02:00
Jan Eitzinger
f65e122f8d Merge pull request #386 from ClusterCockpit/hotfix
Prepare re-release for v1.4.4
2025-04-28 10:18:44 +02:00
161f0744aa fix: enforce apiAllowedIPs config option
Fixes #385
2025-04-28 09:54:22 +02:00
95de9ad3b3 Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix 2025-04-28 08:52:27 +02:00
Jan Eitzinger
d5c170055f Merge pull request #384 from brinkcoder/fix/auth-log-iperr
[BUGFIX] correct wrong variable in AuthApi error logging
2025-04-28 08:51:42 +02:00
brinkcoder
61f0521072 fix: correct logging variable from err to ipErr in AuthApi 2025-04-25 22:37:16 +02:00
Christoph Kluge
6ca14c55f2 fix: fix error in jobsMetricStatisticsHistogram calculation
- also reduces overhead, simplifies query
2025-04-25 18:09:21 +02:00
Jan Eitzinger
1309d09aee Merge pull request #383 from ClusterCockpit/hotfix
Remove websocket sse GraphQL support
2025-04-24 12:59:34 +02:00
aba75b3a19 Remove websocket sse GraphQL support 2025-04-24 12:57:37 +02:00
Jan Eitzinger
e87481d8db Merge pull request #382 from ClusterCockpit/hotfix
Prepare Bugfix Release 1.4.4
2025-04-24 11:46:25 +02:00
acaad69917 Prepare Bugfix Release 1.4.4 2025-04-24 11:42:34 +02:00
Jan Eitzinger
ff588ad57a Merge pull request #381 from ClusterCockpit/dev
Dev
2025-04-24 11:18:55 +02:00
65df27154c Cleanup and regenerate Swagger docs 2025-04-24 11:14:51 +02:00
8dfa1957f4 Merge hotfix changes 2025-04-24 11:07:02 +02:00
570eba3794 Cleanup Swagger docs 2025-04-24 11:01:13 +02:00
94a39fc61f Readd tag endpoints 2025-04-24 10:53:55 +02:00
2d359e5f99 Merge rest.go 2025-04-24 10:40:03 +02:00
Jan Eitzinger
04692e0c44 Merge pull request #379 from ClusterCockpit/add_tag_delete
Add Tag Deletion: API and Frontend
2025-04-24 10:09:51 +02:00
Jan Eitzinger
809fd23b88 Merge pull request #380 from ClusterCockpit/review_api_auth
Review api auth
2025-04-24 10:08:18 +02:00
Christoph Kluge
e3653daea3 reduce code in tag svelte view 2025-04-23 17:59:26 +02:00
Christoph Kluge
48fa75386c feat: add tag removal api endpoints 2025-04-23 16:12:56 +02:00
Christoph Kluge
1b3a12a4dc feat: add remove functionality to tag view, add confirm alert 2025-04-23 15:01:12 +02:00
Christoph Kluge
543ddf540e implement removeTagFromList mutation, add tag mutation access checks 2025-04-23 14:51:01 +02:00
Christoph Kluge
a3fb471546 adapt and improve svelte taglist component 2025-04-22 17:33:17 +02:00
Christoph Kluge
277f964b30 move taglist a from go tmpl to svelte component 2025-04-22 13:47:25 +02:00
Christoph Kluge
9bcf7adb67 add api calls for removing tags, initial branch commit 2025-04-17 17:31:59 +02:00
Christoph Kluge
f343fa0071 fix: add name scrambling demo mode to all views
- was missing for analysis, status and nodelist
2025-04-17 11:15:35 +02:00
Christoph Kluge
e5862e9218 Merge branch 'dev' of https://github.com/ClusterCockpit/cc-backend into dev 2025-04-16 18:36:15 +02:00
Christoph Kluge
29ae2423f8 fix metricconfig pointer copy, add disabled metric card in jobView
- skips disabled metrics in backend, see cc-backend tries to retrieve "removed" metrics #377
2025-04-16 18:36:12 +02:00
Christoph Kluge
1755a4a7df remove separate userapiallowedips config and check 2025-04-14 11:58:42 +02:00
Christoph Kluge
25d3325049 add getUsers to admin REST api 2025-04-14 11:36:03 +02:00
Christoph Kluge
fb6a4c3b87 review and move api endpoints secured check 2025-04-09 16:00:27 +02:00
317f80a984 fix: Replace deprecated gqlgen NewDefaultServer call 2025-04-09 09:40:52 +02:00
28cdc1d9e5 fix: Update endpoints in Swagger UI 2025-04-09 09:13:21 +02:00
c2087b15d5 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-04-09 07:28:02 +02:00
a8d785beb3 Remove redundant check in auth package 2025-04-09 07:27:59 +02:00
Christoph Kluge
a6784b5549 fix: reintroduce statstable id natural sort order
- see Use natural sort order for IDs in statistics tables #369
2025-04-08 16:00:07 +02:00
Christoph Kluge
d770292be8 feat: add nodename matcher select to filter, defaults to equal match
- see PR !353
2025-04-08 14:52:07 +02:00
Christoph Kluge
b3a1037ade Merge pull request #353 from brinkcoder/fix-node-filter
Fix node filter to use EXISTS for exact hostname matches
2025-04-08 12:57:04 +02:00
Christoph Kluge
02946cf0b4 fix: fix nodelist filter result displaying wrong information
- missing svelte iteration key added
2025-04-07 17:03:23 +02:00
Christoph Kluge
cf051d5108 Merge pull request #375 from ClusterCockpit/master
Dependabot Update Dev Branch
2025-04-07 16:09:31 +02:00
Christoph Kluge
96977c6183 Merge pull request #374 from ClusterCockpit/review_logging
Review logging
2025-04-07 16:03:48 +02:00
Jan Eitzinger
73d83164fc Merge pull request #373 from ClusterCockpit/dependabot/go_modules/golang.org/x/net-0.36.0
Bump golang.org/x/net from 0.35.0 to 0.36.0
2025-04-04 11:05:01 +02:00
dependabot[bot]
1064f5e4a8 Bump golang.org/x/net from 0.35.0 to 0.36.0
Bumps [golang.org/x/net](https://github.com/golang/net) from 0.35.0 to 0.36.0.
- [Commits](https://github.com/golang/net/compare/v0.35.0...v0.36.0)

---
updated-dependencies:
- dependency-name: golang.org/x/net
  dependency-version: 0.36.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 09:01:59 +00:00
Jan Eitzinger
5be98c7087 Merge pull request #372 from ClusterCockpit/dependabot/npm_and_yarn/web/frontend/babel/runtime-7.27.0
Bump @babel/runtime from 7.26.0 to 7.27.0 in /web/frontend
2025-04-04 10:55:34 +02:00
dependabot[bot]
0d689c7dff Bump @babel/runtime from 7.26.0 to 7.27.0 in /web/frontend
Bumps [@babel/runtime](https://github.com/babel/babel/tree/HEAD/packages/babel-runtime) from 7.26.0 to 7.27.0.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.27.0/packages/babel-runtime)

---
updated-dependencies:
- dependency-name: "@babel/runtime"
  dependency-version: 7.27.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 08:45:16 +00:00
Jan Eitzinger
1f24ed46a0 Merge pull request #371 from ClusterCockpit/dependabot/go_modules/github.com/golang-jwt/jwt/v5-5.2.2
Bump github.com/golang-jwt/jwt/v5 from 5.2.1 to 5.2.2
2025-04-04 10:37:18 +02:00
dependabot[bot]
92b4159f9e Bump github.com/golang-jwt/jwt/v5 from 5.2.1 to 5.2.2
Bumps [github.com/golang-jwt/jwt/v5](https://github.com/golang-jwt/jwt) from 5.2.1 to 5.2.2.
- [Release notes](https://github.com/golang-jwt/jwt/releases)
- [Changelog](https://github.com/golang-jwt/jwt/blob/main/VERSION_HISTORY.md)
- [Commits](https://github.com/golang-jwt/jwt/compare/v5.2.1...v5.2.2)

---
updated-dependencies:
- dependency-name: github.com/golang-jwt/jwt/v5
  dependency-version: 5.2.2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-04 08:35:15 +00:00
Jan Eitzinger
5817b41e29 Merge pull request #368 from ClusterCockpit/dev
Dev
2025-03-20 13:02:23 +01:00
d6b132e3a6 Merge branch 'master' into dev 2025-03-20 12:51:23 +01:00
Jan Eitzinger
318f70f34c Merge pull request #365 from ClusterCockpit/split_statsTable_query
Split StatsTable DataQuery from JobMetrics Query In Job-View
2025-03-20 12:50:23 +01:00
Jan Eitzinger
e41525d40a Merge pull request #366 from ClusterCockpit/hotfix
fix: always return hasNextPage boolean to frontend
2025-03-20 12:49:57 +01:00
Jan Eitzinger
a102220e52 Merge pull request #367 from ClusterCockpit/makefile-fix
Fix 'make -B', don't fail if $(VAR) already exists
2025-03-20 12:47:16 +01:00
Christoph Kluge
e9a214c5b2 fix: add nullSafe condition to monitoringStatus display on metric queryError 2025-03-19 14:57:27 +01:00
Christoph Kluge
c53f5eb144 fix: always return hasNextPage boolean to frontend
- removes dependency on uiDefaults setting
2025-03-18 18:01:37 +01:00
Christoph Kluge
9ed64e0388 Review logging, comment cleanup 2025-03-17 17:39:17 +01:00
Christoph Kluge
93040d4629 IMplement LoadNode Data, LoadNodeListData, LoadScopedStats for influxDB2 backend
- Untested
- Only Node Scope
2025-03-17 15:25:33 +01:00
Christoph Kluge
0144ad43f5 Implement NodeListData and ScopedStats for Prometheus Backend 2025-03-17 11:03:51 +01:00
Christoph Kluge
8da2fc30c3 split statsTable data from jobMetrics query, frontend refactor 2025-03-14 16:36:31 +01:00
0e27ae7795 Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev 2025-03-14 10:52:39 +01:00
33c6cdb9fe Update test workflow 2025-03-14 10:52:27 +01:00
Christoph Kluge
f5f36427a4 split statsTable data from jobMetrics query, initial commit
- mainly backend changes
- statstable changes only for prototyping
2025-03-13 17:33:55 +01:00
exterr2f
16db9bd1a2 Fix node filter: Use EXISTS with Eq for exact match and LIKE for Contains 2025-03-11 12:20:13 +01:00
Christoph Kluge
c964f09a4f Merge branch 'dev' into review_logging 2025-02-28 17:19:00 +01:00
Christoph Kluge
0bc32f27df Merge branch 'dev' into migrate_svelte5 2025-02-28 17:18:30 +01:00
Christoph Kluge
bd0cc69668 Review fatalf log calls and messages 2025-02-27 18:10:04 +01:00
Christoph Kluge
84fffac264 Merge branch 'dev' into review_logging 2025-02-27 15:20:46 +01:00
Christoph Kluge
fc0c76bd77 Apply new log funtion to init and main, review or add logtexts 2025-02-26 15:20:25 +01:00
Christoph Kluge
d209547968 Remove dedicated fatal loglevel, change to Fprintln for unformatted 2025-02-26 14:40:54 +01:00
Christoph Kluge
0191bc3821 Annotate and review log functions, add stdout writers 2025-02-25 10:21:48 +01:00
Christoph Kluge
c4b98ade53 increase user table height, add but disable autocomplete attribute
- missing autocomplete attribute was logged as warning in chrome console
2025-02-05 15:18:42 +01:00
Christoph Kluge
f2e85306ca fix wrong label ids in options view
- allowed setting wrong field
2025-02-05 12:58:51 +01:00
Christoph Kluge
42b9de8360 add canvasId default, fix analysis view pie props 2025-02-05 12:51:06 +01:00
Christoph Kluge
6c244f3121 renderodelist spinner info only for continuous scroll 2025-02-04 18:41:10 +01:00
Christoph Kluge
9f56213d2f fix list view sorting of string fields 2025-02-04 17:52:11 +01:00
Christoph Kluge
fb2f7cf680 fix dirty vars on textfilter reset 2025-02-04 13:29:09 +01:00
Christoph Kluge
8fcdd24f84 Second onclick pass 2025-02-04 12:52:56 +01:00
Christoph Kluge
aaafde4a7c add function syntax to sveltestrap onclick events and others
- fixes event_handler_invalid svelte warning and blockage
2025-02-04 12:13:06 +01:00
Christoph Kluge
2b23003556 fix metric selection drag and drop 2025-02-03 19:36:28 +01:00
Christoph Kluge
5681062f01 Initial migration to Svelte5 via full syntax compatability
- updated all dependencies
- removed svelte-chartjs wrapper from dependencies
- sveltestrap causes compilation warnings (once)
- Header.svelte uses new Svelte5 syntax as example
- fixed most initial compilation warnings except circular dependencies with TBD cause
2025-02-03 17:31:01 +01:00
Michael Panzlaff
d61bf212f5 Fix 'make -B', don't fail if $(VAR) already exists 2025-02-03 17:02:13 +01:00
AmritanshuV
efbe53b6b4 Rules 2024-08-15 12:40:57 +02:00
649d50812b Merge branch 'master' into 134-job-tagging 2024-04-22 11:03:13 +02:00
2502989ca2 Refactor 2023-09-28 10:20:35 +02:00
ba7cc9168e feat: add automatic application detection and tagging 2023-09-28 10:20:20 +02:00
dc0d9fe038 Add more tags to test db 2023-09-27 15:01:08 +02:00
0e6c6937cd Merge branch 'master' into 134-job-tagging 2023-09-27 05:30:36 +02:00
d839c53642 Add initial structure 2023-08-22 10:56:32 +02:00
376 changed files with 59469 additions and 30171 deletions

View File

@@ -1,331 +0,0 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Release
# Run on tag push
on:
push:
tags:
- '**'
jobs:
#
# Build on AlmaLinux 8.5 using golang-1.18.2
#
AlmaLinux-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8.5
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which npm
dnf --assumeyes install 'dnf-command(builddep)'
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# AlmaLinux 8.5 is a derivate of RedHat Enterprise Linux 8 (UBI8),
# so the created RPM both contain the substring 'el8' in the RPM file names
# This step replaces the substring 'el8' to 'alma85'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8.5 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el8/alma85/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el8/alma85}"
NEW_SRPM=${OLD_SRPM/el8/alma85}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "::set-output name=SRPM::${NEW_SRPM}"
echo "::set-output name=RPM::${NEW_RPM}"
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.SRPM }}
#
# Build on UBI 8 using golang-1.18.2
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
dnf --assumeyes --disableplugin=subscription-manager install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-focal-build:
runs-on: ubuntu-latest
container: ubuntu:20.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
apt --assume-yes install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu20.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu22.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8.5 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
- name: Download AlmaLinux 8.5 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
- name: Download UBI 8 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for UBI 8
- name: Download Ubuntu 20.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
# The gh-release action afterwards does not accept file lists but all
# files have to be listed at 'files'. The step creates one output per
# RPM package (2 per distro)
- name: Set RPM variables
id: files
run: |
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
echo "ALMA_85_RPM::${ALMA_85_RPM}"
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "U_2004_DEB::${U_2004_DEB}"
echo "U_2204_DEB::${U_2204_DEB}"
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
# See: https://github.com/softprops/action-gh-release
- name: Release
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
with:
name: cc-backend-${{github.ref_name}}
files: |
${{ steps.files.outputs.ALMA_85_RPM }}
${{ steps.files.outputs.ALMA_85_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.U_2004_DEB }}
${{ steps.files.outputs.U_2204_DEB }}

View File

@@ -7,7 +7,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: 1.22.x
go-version: 1.25.x
- name: Checkout code
uses: actions/checkout@v3
- name: Build, Vet & Test

13
.gitignore vendored
View File

@@ -1,14 +1,20 @@
/cc-backend
/.env
/config.json
/uiConfig.json
/var/job-archive
/var/machine-state
/var/job.db-shm
/var/job.db-wal
/var/*.db-shm
/var/*.db-wal
/var/*.db
/var/*.txt
/var/checkpoints*
migrateTimestamps.pl
test_ccms_*
/web/frontend/public/build
/web/frontend/node_modules
@@ -21,3 +27,6 @@
/.vscode/*
dist/
*.db
.idea
tools/archive-migration/archive-migration
tools/archive-manager/archive-manager

26
AGENTS.md Normal file
View File

@@ -0,0 +1,26 @@
# ClusterCockpit Backend - Agent Guidelines
## Build/Test Commands
- Build: `make` or `go build ./cmd/cc-backend`
- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`)
- Run single test: `go test -run TestName ./path/to/package`
- Run single test file: `go test ./path/to/package -run TestName`
- Frontend build: `cd web/frontend && npm install && npm run build`
- Generate GraphQL: `make graphql` (uses gqlgen)
- Generate Swagger: `make swagger` (uses swaggo/swag)
## Code Style
- **Formatting**: Use `gofumpt` for all Go files (strict requirement)
- **Copyright header**: All files must include copyright header (see existing files)
- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration
- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines)
- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`)
- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package
- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`)
- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName`
- **Comments**: Document all exported functions/types with godoc-style comments
- **Structs**: Document fields with inline comments, especially for complex configurations
- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses
- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding

306
CLAUDE.md Normal file
View File

@@ -0,0 +1,306 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with
code in this repository.
## Project Overview
ClusterCockpit is a job-specific performance monitoring framework for HPC
clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a
Svelte-based frontend, and manages job archives and metric data from various
time-series databases.
## Build and Development Commands
### Building
```bash
# Build everything (frontend + backend)
make
# Build only the frontend
make frontend
# Build only the backend (requires frontend to be built first)
go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
```
### Testing
```bash
# Run all tests
make test
# Run tests with verbose output
go test -v ./...
# Run tests for a specific package
go test ./internal/repository
```
### Code Generation
```bash
# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls)
make graphql
# Regenerate Swagger/OpenAPI docs (after modifying API comments)
make swagger
```
### Frontend Development
```bash
cd web/frontend
# Install dependencies
npm install
# Build for production
npm run build
# Development mode with watch
npm run dev
```
### Running
```bash
# Initialize database and create admin user
./cc-backend -init-db -add-user demo:admin:demo
# Start server in development mode (enables GraphQL Playground and Swagger UI)
./cc-backend -server -dev -loglevel info
# Start demo with sample data
./startDemo.sh
```
## Architecture
### Backend Structure
The backend follows a layered architecture with clear separation of concerns:
- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems
- **internal/repository**: Data access layer using repository pattern
- Abstracts database operations (SQLite3 only)
- Implements LRU caching for performance
- Provides repositories for Job, User, Node, and Tag entities
- Transaction support for batch operations
- **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
- **internal/graph**: GraphQL API (uses gqlgen)
- Schema in `api/schema.graphqls`
- Generated code in `internal/graph/generated/`
- Resolvers in `internal/graph/schema.resolvers.go`
- **internal/auth**: Authentication layer
- Supports local accounts, LDAP, OIDC, and JWT tokens
- Implements rate limiting for login attempts
- **pkg/metricstore**: Metric store with data loading API
- In-memory metric storage with checkpointing
- Query API for loading job metric data
- **internal/archiver**: Job archiving to file-based archive
- **internal/api/nats.go**: NATS-based API for job and node operations
- Subscribes to NATS subjects for job events (start/stop)
- Handles node state updates via NATS
- Uses InfluxDB line protocol message format
- **pkg/archive**: Job archive backend implementations
- File system backend (default)
- S3 backend
- SQLite backend (experimental)
- **parquet** sub-package: Parquet format support (schema, reader, writer, conversion)
- **internal/metricstoreclient**: Client for cc-metric-store queries
### Frontend Structure
- **web/frontend**: Svelte 5 application
- Uses Rollup for building
- Components organized by feature (analysis, job, user, etc.)
- GraphQL client using @urql/svelte
- Bootstrap 5 + SvelteStrap for UI
- uPlot for time-series visualization
- **web/templates**: Server-side Go templates
### Key Concepts
**Job Archive**: Completed jobs are stored in a file-based archive following the
[ClusterCockpit job-archive
specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
Each job has a `meta.json` file with metadata and metric data files.
**Metric Data Repositories**: Time-series metric data is stored separately from
job metadata. The system supports multiple backends (cc-metric-store is
recommended). Configuration is per-cluster in `config.json`.
**Authentication Flow**:
1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT)
2. Each authenticator's `CanLogin` method is called to determine if it should handle the request
3. The first authenticator that returns true performs the actual `Login`
4. JWT tokens are used for API authentication
**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are
applied automatically on startup. Version tracking in `version` table.
**Scopes**: Metrics can be collected at different scopes:
- Node scope (always available)
- Core scope (for jobs with ≤8 nodes)
- Accelerator scope (for GPU/accelerator metrics)
## Configuration
- **config.json**: Main configuration (clusters, metric repositories, archive settings)
- `main.apiSubjects`: NATS subject configuration (optional)
- `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event")
- `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state")
- `nats`: NATS client connection configuration (optional)
- `address`: NATS server address (e.g., "nats://localhost:4222")
- `username`: Authentication username (optional)
- `password`: Authentication password (optional)
- `creds-file-path`: Path to NATS credentials file (optional)
- **.env**: Environment variables (secrets like JWT keys)
- Copy from `configs/env-template.txt`
- NEVER commit this file
- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config)
## Database
- Default: SQLite 3 (`./var/job.db`)
- Connection managed by `internal/repository`
- Schema version in `internal/repository/migration.go`
## Code Generation
**GraphQL** (gqlgen):
- Schema: `api/schema.graphqls`
- Config: `gqlgen.yml`
- Generated code: `internal/graph/generated/`
- Custom resolvers: `internal/graph/schema.resolvers.go`
- Run `make graphql` after schema changes
**Swagger/OpenAPI**:
- Annotations in `internal/api/*.go`
- Generated docs: `internal/api/docs.go`, `api/swagger.yaml`
- Run `make swagger` after API changes
## Testing Conventions
- Test files use `_test.go` suffix
- Test data in `testdata/` subdirectories
- Repository tests use in-memory SQLite
- API tests use httptest
## Common Workflows
### Adding a new GraphQL field
1. Edit schema in `api/schema.graphqls`
2. Run `make graphql`
3. Implement resolver in `internal/graph/schema.resolvers.go`
### Adding a new REST endpoint
1. Add handler in `internal/api/*.go`
2. Add route in `internal/api/rest.go`
3. Add Swagger annotations
4. Run `make swagger`
### Adding a new metric data backend
1. Implement metric loading functions in `pkg/metricstore/query.go`
2. Add cluster configuration to metric store initialization
3. Update config.json schema documentation
### Modifying database schema
1. Create new migration in `internal/repository/migrations/sqlite3/`
2. Increment `repository.Version`
3. Test with fresh database and existing database
## NATS API
The backend supports a NATS-based API as an alternative to the REST API for job and node operations.
### Setup
1. Configure NATS client connection in `config.json`:
```json
{
"nats": {
"address": "nats://localhost:4222",
"username": "user",
"password": "pass"
}
}
```
2. Configure API subjects in `config.json` under `main`:
```json
{
"main": {
"apiSubjects": {
"subjectJobEvent": "cc.job.event",
"subjectNodeState": "cc.node.state"
}
}
}
```
### Message Format
Messages use **InfluxDB line protocol** format with the following structure:
#### Job Events
**Start Job:**
```
job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
```
**Stop Job:**
```
job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
```
**Tags:**
- `function`: Either `start_job` or `stop_job`
**Fields:**
- `event`: JSON payload containing job data (see REST API documentation for schema)
#### Node State Updates
```json
{
"cluster": "testcluster",
"nodes": [
{
"hostname": "node001",
"states": ["allocated"],
"cpusAllocated": 8,
"memoryAllocated": 16384,
"gpusAllocated": 0,
"jobsRunning": 1
}
]
}
```
### Implementation Notes
- NATS API mirrors REST API functionality but uses messaging
- Job start/stop events are processed asynchronously
- Duplicate job detection is handled (same as REST API)
- All validation rules from REST API apply
- Messages are logged; no responses are sent back to publishers
- If NATS client is unavailable, API subscriptions are skipped (logged as warning)
## Dependencies
- Go 1.24.0+ (check go.mod for exact version)
- Node.js (for frontend builds)
- SQLite 3 (only supported database)
- Optional: NATS server for NATS API integration

View File

@@ -1,8 +1,6 @@
TARGET = ./cc-backend
VAR = ./var
CFG = config.json .env
FRONTEND = ./web/frontend
VERSION = 1.4.3
VERSION = 1.5.0
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
@@ -42,22 +40,22 @@ SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
.NOTPARALLEL:
$(TARGET): $(VAR) $(CFG) $(SVELTE_TARGETS)
$(TARGET): $(SVELTE_TARGETS)
$(info ===> BUILD cc-backend)
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
frontend:
$(info ===> BUILD frontend)
cd web/frontend && npm install && npm run build
cd web/frontend && npm ci && npm run build
swagger:
$(info ===> GENERATE swagger)
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./pkg/schema -g rest.go -o ./api
@go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
@mv ./api/docs.go ./internal/api/docs.go
graphql:
$(info ===> GENERATE graphql)
@go run github.com/99designs/gqlgen
@go tool github.com/99designs/gqlgen
clean:
$(info ===> CLEAN)
@@ -68,7 +66,7 @@ distclean:
@$(MAKE) clean
$(info ===> DISTCLEAN)
@rm -rf $(FRONTEND)/node_modules
@rm -rf $(VAR)
@rm -rf ./var
test:
$(info ===> TESTING)
@@ -84,14 +82,6 @@ tags:
$(VAR):
@mkdir -p $(VAR)
config.json:
$(info ===> Initialize config.json file)
@cp configs/config.json config.json
.env:
$(info ===> Initialize .env file)
@cp configs/env-template.txt .env
$(SVELTE_TARGETS): $(SVELTE_SRC)
$(info ===> BUILD frontend)
cd web/frontend && npm install && npm run build
cd web/frontend && npm ci && npm run build

149
README.md
View File

@@ -1,5 +1,8 @@
# NOTE
While we do our best to keep the master branch in a usable state, there is no guarantee the master branch works.
Please do not use it for production!
Please have a look at the [Release
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
for breaking changes!
@@ -19,19 +22,23 @@ switching from PHP Symfony to a Golang based solution are explained
## Overview
This is a Golang web backend for the ClusterCockpit job-specific performance
monitoring framework. It provides a REST API for integrating ClusterCockpit with
an HPC cluster batch system and external analysis scripts. Data exchange between
the web front-end and the back-end is based on a GraphQL API. The web frontend
is also served by the backend using [Svelte](https://svelte.dev/) components.
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
monitoring framework. It provides a REST API and an optional NATS-based messaging
API for integrating ClusterCockpit with an HPC cluster batch system and external
analysis scripts. Data exchange between the web front-end and the back-end is
based on a GraphQL API. The web frontend is also served by the backend using
[Svelte](https://svelte.dev/) components. Layout and styling are based on
[Bootstrap 5](https://getbootstrap.com/) using
[Bootstrap Icons](https://icons.getbootstrap.com/).
The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by
default. Optionally it can use a MySQL/MariaDB database server. While there are
metric data backends for the InfluxDB and Prometheus time series databases, the
only tested and supported setup is to use cc-metric-store as the metric data
backend. Documentation on how to integrate ClusterCockpit with other time series
databases will be added in the future.
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
While there are metric data backends for the InfluxDB and Prometheus time series
databases, the only tested and supported setup is to use cc-metric-store as the
metric data backend. Documentation on how to integrate ClusterCockpit with other
time series databases will be added in the future.
For real-time integration with HPC systems, the backend can subscribe to
[NATS](https://nats.io/) subjects to receive job start/stop events and node
state updates, providing an alternative to REST API polling.
Completed batch jobs are stored in a file-based job archive according to
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
@@ -69,7 +76,7 @@ You can also try the demo using the latest release binary.
Create a folder and put the release binary `cc-backend` into this folder.
Execute the following steps:
``` shell
```shell
./cc-backend -init
vim config.json (Add a second cluster entry and name the clusters alex and fritz)
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
@@ -88,11 +95,11 @@ Analysis, Systems and Status views).
There is a Makefile to automate the build of cc-backend. The Makefile supports
the following targets:
* `make`: Initialize `var` directory and build svelte frontend and backend
binary. Note that there is no proper prerequisite handling. Any change of
frontend source files will result in a complete rebuild.
* `make clean`: Clean go build cache and remove binary.
* `make test`: Run the tests that are also run in the GitHub workflow setup.
- `make`: Initialize `var` directory and build svelte frontend and backend
binary. Note that there is no proper prerequisite handling. Any change of
frontend source files will result in a complete rebuild.
- `make clean`: Clean go build cache and remove binary.
- `make test`: Run the tests that are also run in the GitHub workflow setup.
A common workflow for setting up cc-backend from scratch is:
@@ -128,41 +135,73 @@ ln -s <your-existing-job-archive> ./var/job-archive
## Project file structure
* [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
contains the API schema files for the REST and GraphQL APIs. The REST API is
documented in the OpenAPI 3.0 format in
[./api/openapi.yaml](./api/openapi.yaml).
* [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
contains `main.go` for the main application.
* [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
contains documentation about configuration and command line options and required
environment variables. A sample configuration file is provided.
* [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
contains more in-depth documentation.
* [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
contains an example of setting up systemd for production use.
* [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
contains library source code that is not intended for use by others.
* [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
contains Go packages that can be used by other projects.
* [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
Additional command line helper tools.
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
Commands for getting infos about and existing job archive.
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
Tool to convert external pubkey for use in `cc-backend`.
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
contains a small application to generate a compatible JWT keypair. You find
documentation on how to use it
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
* [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
Server-side templates and frontend-related files:
* [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
Svelte components and static assets for the frontend UI
* [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
Server-side Go templates
* [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
Configures the behaviour and generation of
[gqlgen](https://github.com/99designs/gqlgen).
* [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
is a shell script that sets up demo data, and builds and starts `cc-backend`.
- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github)
GitHub Actions workflows and dependabot configuration for CI/CD.
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
contains the API schema files for the REST and GraphQL APIs. The REST API is
documented in the OpenAPI 3.0 format in
[./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in
[./api/schema.graphqls](./api/schema.graphqls).
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
contains the main application entry point and CLI implementation.
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
contains documentation about configuration and command line options and required
environment variables. Sample configuration files are provided.
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
contains an example of setting up systemd for production use.
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
contains library source code that is not intended for use by others.
- [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api)
REST API handlers and NATS integration
- [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver)
Job archiving functionality
- [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth)
Authentication (local, LDAP, OIDC) and JWT token handling
- [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config)
Configuration management and validation
- [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph)
GraphQL schema and resolvers
- [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer)
Job data import and database initialization
- [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch)
Dispatches metric data loading to appropriate backends
- [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository)
Database repository layer for jobs and metadata
- [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig)
HTTP router configuration and middleware
- [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger)
Job classification and application detection
- [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
Background task management and scheduled jobs
- [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient)
Client for cc-metric-store queries
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
contains Go packages that can be used by other projects.
- [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
Job archive backend implementations (filesystem, S3, SQLite)
- [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore)
In-memory metric data store with checkpointing and metric loading
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
Additional command line helper tools.
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
Commands for getting infos about an existing job archive, importing jobs
between archive backends, and converting archives between JSON and Parquet formats.
- [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
Tool for migrating job archives between formats.
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
Tool to convert external pubkey for use in `cc-backend`.
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
contains a small application to generate a compatible JWT keypair. You find
documentation on how to use it
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
- [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
Server-side templates and frontend-related files:
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
Svelte components and static assets for the frontend UI
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
Server-side Go templates, including monitoring views
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
Configures the behaviour and generation of
[gqlgen](https://github.com/99designs/gqlgen).
- [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
is a shell script that sets up demo data, and builds and starts `cc-backend`.

View File

@@ -1,47 +1,277 @@
# `cc-backend` version 1.4.3
# `cc-backend` version 1.5.0
Supports job archive version 2 and database version 8.
Supports job archive version 3 and database version 10.
This is a bug fix release of `cc-backend`, the API backend and frontend
This is a feature release of `cc-backend`, the API backend and frontend
implementation of ClusterCockpit.
For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).
## Breaking changes for minor release 1.4.x
## Breaking changes
- You need to perform a database migration. Depending on your database size the
migration might require several hours!
- You need to adapt the `cluster.json` configuration files in the job-archive,
add new required attributes to the metric list and after that edit
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
attribute set can be filtered and show up in the footprint UI and polar plot.
- Continuous scrolling is default now in all job lists. You can change this back
to paging globally, also every user can configure to use paging or continuous
scrolling individually.
- Tags have a scope now. Existing tags will get global scope in the database
migration.
### Configuration changes
## New features
- **JSON attribute naming**: All JSON configuration attributes now use `kebab-case`
style consistently (e.g., `api-allowed-ips` instead of `apiAllowedIPs`).
Update your `config.json` accordingly.
- **Removed `disable-archive` option**: This obsolete configuration option has been removed.
- **Removed `clusters` config section**: The separate clusters configuration section
has been removed. Cluster information is now derived from the job archive.
- **`apiAllowedIPs` is now optional**: If not specified, defaults to not
restricted.
- Detailed Node List
- Adds new routes `/systems/list/$cluster` and `/systems/list/$cluster/$subcluster`
- Displays live, scoped metric data requested from the nodes indepenent of jobs
- Color Blind Mode
- Set on a per-user basis in options
- Applies to plot data, plot background color, statsseries colors, roofline timescale
- Job-View metric selection is now persisted based on the jobs subcluster.
Helpful for heterogeneous subcluster configurations.
- Histogram Bin Select in User-View
- Metric-Histograms: `10 Bins` now default, selectable options `20, 50, 100`
- Job-Duration-Histogram: `48h in 1h Bins` now default, selectable options:
- `60 minutes in 1 minute Bins`
- `12 hours in 10 minute Bins`
- `3 days in 6 hour Bins`
- `7 days in 12 hour Bins`
### Architecture changes
- **Web framework replaced**: Migrated from `gorilla/mux` to `chi` as the HTTP
router. This should be transparent to users but affects how middleware and
routes are composed. A proper 404 handler is now in place.
- **MetricStore moved**: The `metricstore` package has been moved from `internal/`
to `pkg/` as it is now part of the public API.
- **MySQL/MariaDB support removed**: Only SQLite is now supported as the database backend.
- **Archive to Cleanup renaming**: Archive-related functions have been refactored
and renamed to "Cleanup" for clarity.
- **`minRunningFor` filter removed**: This undocumented filter has been removed
from the API and frontend.
### Dependency changes
- **cc-lib v2.5.1**: Switched to cc-lib version 2 with updated APIs (currently at v2.5.1)
- **cclib NATS client**: Now using the cclib NATS client implementation
- Removed obsolete `util.Float` usage from cclib
## Major new features
### NATS API Integration
- **Real-time job events**: Subscribe to job start/stop events via NATS
- **Node state updates**: Receive real-time node state changes via NATS
- **Configurable subjects**: NATS API subjects are now configurable via `api-subjects`
- **Deadlock fixes**: Improved NATS client stability and graceful shutdown
### Public Dashboard
- **Public-facing interface**: New public dashboard route for external users
- **DoubleMetricPlot component**: New visualization component for comparing metrics
- **Improved layout**: Reviewed and optimized dashboard layouts for better readability
### Enhanced Node Management
- **Node state tracking**: New node table in database with timestamp tracking
- **Node state filtering**: Filter jobs by node state in systems view
- **Node list enhancements**: Improved paging, filtering, and continuous scroll support
- **Nodestate retention and archiving**: Node state data is now subject to configurable
retention policies and can be archived to Parquet format for long-term storage
- **Faulty node metric tracking**: Faulty node state metric lists are persisted to the database
### Health Monitoring
- **Health status dashboard**: New dedicated "Health" tab in the status details view
showing per-node metric health across the cluster
- **CCMS health check**: Support for querying health status of external
cc-metric-store (CCMS) instances via the API
- **GraphQL health endpoints**: New GraphQL queries and resolvers for health data
- **Cluster/subcluster filter**: Filter health status view by cluster or subcluster
### Log Viewer
- **Web-based log viewer**: New log viewer page in the admin interface for inspecting
backend log output directly from the browser without shell access
- **Accessible from header**: Quick access link from the navigation header
### MetricStore Improvements
- **Memory tracking worker**: New worker for CCMS memory usage tracking
- **Dynamic retention**: Support for job specific dynamic retention times
- **Improved compression**: Transparent compression for job archive imports
- **Parallel processing**: Parallelized Iter function in all archive backends
### Job Tagging System
- **Job tagger option**: Enable automatic job tagging via configuration flag
- **Application detection**: Automatic detection of applications (MATLAB, GROMACS, etc.)
- **Job classification**: Automatic detection of pathological jobs
- **omit-tagged**: Option to exclude tagged jobs from retention/cleanup operations (`none`, `all`, or `user`)
- **Admin UI trigger**: Taggers can be run on-demand from the admin web interface
without restarting the backend
### Archive Backends
- **Parquet archive format**: New Parquet file format for job archiving, providing
columnar storage with efficient compression for analytical workloads
- **S3 backend**: Full support for S3-compatible object storage
- **SQLite backend**: Full support for SQLite backend using blobs
- **Performance improvements**: Fixed performance bugs in archive backends
- **Better error handling**: Improved error messages and fallback handling
- **Zstd compression**: Parquet writers use zstd compression for better
compression ratios compared to the previous snappy default
- **Optimized sort order**: Job and nodestate Parquet files are sorted by
cluster, subcluster, and start time for efficient range queries
### Unified Archive Retention and Format Conversion
- **Uniform retention policy**: Job archive retention now supports both JSON and
Parquet as target formats under a single, consistent policy configuration
- **Archive manager tool**: The `tools/archive-manager` utility now supports
format conversion between JSON and Parquet job archives
- **Parquet reader**: Full Parquet archive reader implementation for reading back
archived job data
## New features and improvements
### Frontend
- **Loading indicators**: Added loading indicators to status detail and job lists
- **Job info layout**: Reviewed and improved job info row layout
- **Metric selection**: Enhanced metric selection with drag-and-drop fixes
- **Filter presets**: Move list filter preset to URL for easy sharing
- **Job comparison**: Improved job comparison views and plots
- **Subcluster reactivity**: Job list now reacts to subcluster filter changes
- **Short jobs quick selection**: New "Short jobs" quick-filter button in job lists
replaces the removed undocumented `minRunningFor` filter
- **Row plot cursor sync**: Cursor position is now synchronized across all metric
plots in a job list row for easier cross-metric comparison
- **Disabled metrics handling**: Improved handling and display of disabled metrics
across job view, node view, and list rows
- **"Not configured" info cards**: Informational cards shown when optional features
are not yet configured
- **Frontend dependencies**: Bumped frontend dependencies to latest versions
- **Svelte 5 compatibility**: Fixed Svelte state warnings and compatibility issues
### Backend
- **Progress bars**: Import function now shows progress during long operations
- **Better logging**: Improved logging with appropriate log levels throughout
- **Graceful shutdown**: Fixed shutdown timeout bugs and hanging issues
- **Configuration defaults**: Sensible defaults for most configuration options
- **Documentation**: Extensive documentation improvements across packages
- **Server flag in systemd unit**: Example systemd unit now includes the `-server` flag
### Security
- **LDAP security hardening**: Improved input validation, connection handling, and
error reporting in the LDAP authenticator
- **OIDC security hardening**: Stricter token validation and improved error handling
in the OIDC authenticator
- **Auth schema extensions**: Additional schema fields for improved auth configuration
### API improvements
- **Role-based metric visibility**: Metrics can now have role-based access control
- **Job exclusivity filter**: New filter for exclusive vs. shared jobs
- **Improved error messages**: Better error messages and documentation in REST API
- **GraphQL enhancements**: Improved GraphQL queries and resolvers
- **Stop job lookup order**: Reversed lookup order in stop job requests for
more reliable job matching (cluster+jobId first, then jobId alone)
### Performance
- **Database indices**: Optimized SQLite indices for better query performance
- **Job cache**: Introduced caching table for faster job inserts
- **Parallel imports**: Archive imports now run in parallel where possible
- **External tool integration**: Optimized use of external tools (fd) for better performance
- **Node repository queries**: Reviewed and optimized node repository SQL queries
- **Buffer pool**: Resized and pooled internal buffers for better memory reuse
### Developer experience
- **AI agent guidelines**: Added documentation for AI coding agents (AGENTS.md, CLAUDE.md)
- **Example API payloads**: Added example JSON API payloads for testing
- **Unit tests**: Added more unit tests for NATS API, node repository, and other components
- **Test improvements**: Better test coverage; test DB is now copied before unit tests
to avoid state pollution between test runs
- **Parquet writer tests**: Comprehensive tests for Parquet archive writing and conversion
## Bug fixes
- Fixed nodelist paging issues
- Fixed metric select drag and drop functionality
- Fixed render race conditions in nodeList
- Fixed tag count grouping including type
- Fixed wrong metricstore schema (missing comma)
- Fixed configuration issues causing shutdown hangs
- Fixed deadlock when NATS is not configured
- Fixed archive backend performance bugs
- Fixed continuous scroll buildup on refresh
- Improved footprint calculation logic
- Fixed polar plot data query decoupling
- Fixed missing resolution parameter handling
- Fixed node table initialization fallback
- Fixed reactivity key placement in nodeList
- Fixed nodeList resolver data handling and increased nodestate filter cutoff
- Fixed job always being transferred to main job table before archiving
- Fixed AppTagger error handling and logging
- Fixed log endpoint formatting and correctness
- Fixed automatic refresh in metric status tab
- Fixed NULL value handling in `health_state` and `health_metrics` columns
- Fixed bugs related to `job_cache` IDs being used in the main job table
- Fixed SyncJobs bug causing start job hooks to be called with wrong (cache) IDs
- Fixed 404 handler route for sub-routers
## Configuration changes
### New configuration options
```json
{
"main": {
"enable-job-taggers": true,
"resampling": {
"minimum-points": 600,
"trigger": 180,
"resolutions": [240, 60]
},
"api-subjects": {
"subject-job-event": "cc.job.event",
"subject-node-state": "cc.node.state"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"cron": {
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"metric-store": {
"cleanup": {
"mode": "archive",
"interval": "48h",
"directory": "./var/archive"
}
},
"archive": {
"retention": {
"policy": "delete",
"age": "6months",
"target-format": "parquet"
}
},
"nodestate": {
"retention": {
"policy": "archive",
"age": "30d",
"archive-path": "./var/nodestate-archive"
}
}
}
```
## Migration notes
- Review and update your `config.json` to use kebab-case attribute names
- If using NATS, configure the new `nats` and `api-subjects` sections
- If using S3 archive backend, configure the new `archive` section options
- Test the new public dashboard at `/public` route
- Review cron worker configuration if you need different frequencies
- If using the archive retention feature, configure the `target-format` option
to choose between `json` (default) and `parquet` output formats
- Consider enabling nodestate retention if you track node states over time
## Known issues
- Currently energy footprint metrics of type energy are ignored for calculating
total energy.
- Resampling for running jobs only works with cc-metric-store
- With energy footprint metrics of type power the unit is ignored and it is
assumed the metric has the unit Watt.

View File

@@ -4,61 +4,89 @@ scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
scalar SchedulerState
scalar MonitoringState
type Node {
id: ID!
hostname: String!
cluster: String!
subCluster: String!
jobsRunning: Int!
cpusAllocated: Int
memoryAllocated: Int
gpusAllocated: Int
schedulerState: SchedulerState!
healthState: MonitoringState!
metaData: Any
healthData: Any
}
type NodeStates {
state: String!
count: Int!
}
type NodeStatesTimed {
state: String!
counts: [Int!]!
times: [Int!]!
}
type Job {
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
energy: Float!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
energy: Float!
SMT: Int!
shared: String!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
footprint: [FootprintValue]
energyFootprint: [EnergyFootprintValue]
metaData: Any
userData: User
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
footprint: [FootprintValue]
energyFootprint: [EnergyFootprintValue]
metaData: Any
userData: User
}
type JobLink {
id: ID!
jobId: Int!
id: ID!
jobId: Int!
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
subClusters: [SubCluster!]! # Hardware partitions/subclusters
name: String!
partitions: [String!]! # Slurm partitions
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
metricConfig: [MetricConfig!]!
footprint: [String!]!
topology: Topology!
metricConfig: [MetricConfig!]!
footprint: [String!]!
}
type FootprintValue {
@@ -80,80 +108,119 @@ type MetricValue {
}
type Topology {
node: [Int!]
socket: [[Int!]!]
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
id: String!
type: String!
model: String!
}
type SubClusterConfig {
name: String!
peak: Float
normal: Float
name: String!
peak: Float
normal: Float
caution: Float
alert: Float
remove: Boolean
alert: Float
remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
alert: Float!
lowerIsBetter: Boolean
subClusters: [SubClusterConfig!]!
}
type Tag {
id: ID!
id: ID!
type: String!
name: String!
scope: String!
}
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
scope: MetricScope!
name: String!
scope: MetricScope!
metric: JobMetric!
}
type JobMetricStatWithName {
name: String!
stats: MetricStatistics!
type ClusterMetricWithName {
name: String!
unit: Unit
timestep: Int!
data: [NullableFloat!]!
}
type JobMetric {
unit: Unit
timestep: Int!
series: [Series!]
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: String
hostname: String!
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
data: [NullableFloat!]!
}
type StatsSeries {
mean: [NullableFloat!]!
median: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type NamedStatsWithScope {
name: String!
scope: MetricScope!
stats: [ScopedStats!]!
}
type ScopedStats {
hostname: String!
id: String
data: MetricStatistics!
}
type JobStats {
id: Int!
jobId: String!
startTime: Int!
duration: Int!
cluster: String!
subCluster: String!
numNodes: Int!
numHWThreads: Int
numAccelerators: Int
stats: [NamedStats!]!
}
type NamedStats {
name: String!
data: MetricStatistics!
}
type Unit {
@@ -167,21 +234,14 @@ type MetricStatistics {
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
median: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
data: [NullableFloat!]!
}
type Footprints {
timeWeights: TimeWeights!
metrics: [MetricFootprints!]!
metrics: [MetricFootprints!]!
}
type TimeWeights {
@@ -190,20 +250,41 @@ type TimeWeights {
coreHours: [NullableFloat!]!
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS }
enum Aggregate {
USER
PROJECT
CLUSTER
SUBCLUSTER
}
enum SortByAggregate {
TOTALWALLTIME
TOTALJOBS
TOTALUSERS
TOTALNODES
TOTALNODEHOURS
TOTALCORES
TOTALCOREHOURS
TOTALACCS
TOTALACCHOURS
}
type NodeMetrics {
host: String!
host: String!
state: String!
subCluster: String!
metrics: [JobMetricWithName!]!
metrics: [JobMetricWithName!]!
}
type ClusterMetrics {
nodeCount: Int!
metrics: [ClusterMetricWithName!]!
}
type NodesResultList {
items: [NodeMetrics!]!
items: [NodeMetrics!]!
offset: Int
limit: Int
count: Int
limit: Int
count: Int
totalNodes: Int
hasNextPage: Boolean
}
@@ -222,14 +303,14 @@ type GlobalMetricListItem {
}
type Count {
name: String!
name: String!
count: Int!
}
type User {
username: String!
name: String!
email: String!
name: String!
email: String!
}
input MetricStatItem {
@@ -238,25 +319,93 @@ input MetricStatItem {
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
globalMetrics: [GlobalMetricListItem!]!
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
globalMetrics: [GlobalMetricListItem!]!
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
## Node Queries New
node(id: ID!): Node
nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
nodesWithMeta(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
nodeStates(filter: [NodeFilter!]): [NodeStates!]!
nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]!
jobMetricStats(id: ID!, metrics: [String!]): [JobMetricStatWithName!]!
jobMetrics(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
resolution: Int
): [JobMetricWithName!]!
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
scopedJobStats(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
): [NamedStatsWithScope!]!
jobs(
filter: [JobFilter!]
page: PageRequest
order: OrderByInput
): JobResultList!
jobsStatistics(
filter: [JobFilter!]
metrics: [String!]
page: PageRequest
sortBy: SortByAggregate
groupBy: Aggregate
numDurationBins: String
numMetricBins: Int
): [JobsStatistics!]!
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]!
rooflineHeatmap(
filter: [JobFilter!]!
rows: Int!
cols: Int!
minX: Float!
minY: Float!
maxX: Float!
maxY: Float!
): [[Float!]!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(
cluster: String!
nodes: [String!]
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
): [NodeMetrics!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList!
nodeMetricsList(
cluster: String!
subCluster: String!
stateFilter: String!
nodeFilter: String!
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
page: PageRequest
resolution: Int
): NodesResultList!
clusterMetrics(
cluster: String!
metrics: [String!]
from: Time!
to: Time!
): ClusterMetrics!
}
type Mutation {
@@ -264,41 +413,61 @@ type Mutation {
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagFromList(tagIds: [ID!]!): [Int!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { range: String, from: Time!, to: Time! }
type IntRangeOutput {
from: Int!
to: Int!
}
type TimeRangeOutput {
range: String
from: Time!
to: Time!
}
input NodeFilter {
hostname: StringInput
cluster: StringInput
subCluster: StringInput
schedulerState: SchedulerState
healthState: MonitoringState
timeStart: Int
}
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
energy: FloatRange
tags: [ID!]
dbId: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
subCluster: StringInput
partition: StringInput
duration: IntRange
energy: FloatRange
minRunningFor: Int
numNodes: IntRange
numNodes: IntRange
numAccelerators: IntRange
numHWThreads: IntRange
numHWThreads: IntRange
startTime: TimeRange
state: [JobState!]
startTime: TimeRange
state: [JobState!]
metricStats: [MetricStatItem!]
exclusive: Int
node: StringInput
shared: String
schedule: String
node: StringInput
}
input OrderByInput {
field: String!
type: String!,
type: String!
order: SortDirectionEnum! = ASC
}
@@ -308,34 +477,46 @@ enum SortDirectionEnum {
}
input StringInput {
eq: String
neq: String
contains: String
eq: String
neq: String
contains: String
startsWith: String
endsWith: String
in: [String!]
endsWith: String
in: [String!]
}
input IntRange { from: Int!, to: Int! }
input TimeRange { range: String, from: Time, to: Time }
input IntRange {
from: Int!
to: Int!
}
input TimeRange {
range: String
from: Time
to: Time
}
input FloatRange {
from: Float!
to: Float!
}
type NodeStateResultList {
items: [Node!]!
count: Int
}
type JobResultList {
items: [Job!]!
items: [Job!]!
offset: Int
limit: Int
count: Int
limit: Int
count: Int
hasNextPage: Boolean
}
type JobLinkResultList {
listQuery: String
items: [JobLink!]!
count: Int
items: [JobLink!]!
count: Int
}
type HistoPoint {
@@ -357,27 +538,28 @@ type MetricHistoPoint {
max: Int
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodes: Int! # Sum of the nodes of all matched jobs
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCores: Int! # Sum of the cores of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccs: Int! # Sum of the accs of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs)
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than config'd ShortRunningJobsDuration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodes: Int! # Sum of the nodes of all matched jobs
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCores: Int! # Sum of the cores of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccs: Int! # Sum of the accs of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
}
input PageRequest {
itemsPerPage: Int!
page: Int!
page: Int!
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,18 +1,22 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file defines all command-line flags and their default values.
package main
import "flag"
var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
)
func cliInit() {
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize swlite database file, config.json and .env")
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
@@ -21,13 +25,14 @@ func cliInit() {
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin,support,manager,api,user]:<password>`")
flag.StringVar(&flagDelUser, "del-user", "", "Remove user by `username`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>")
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`")
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`")
flag.Parse()
}

View File

@@ -1,16 +1,21 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file contains bootstrap logic for initializing the environment,
// creating default configuration files, and setting up the database.
package main
import (
"fmt"
"encoding/json"
"os"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/util"
)
const envString = `
@@ -25,61 +30,65 @@ SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
const configString = `
{
"main": {
"addr": "127.0.0.1:8080",
"archive": {
"kind": "file",
"path": "./var/job-archive"
"short-running-jobs-duration": 300,
"resampling": {
"minimum-points": 600,
"trigger": 300,
"resolutions": [
240,
60
]
},
"api-allowed-ips": [
"*"
],
"emission-constant": 317
},
"cron": {
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2000h"
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
}
}
}
]
"max-age": "2000h"
}
}
}
`
func initEnv() {
if util.CheckFileExists("var") {
fmt.Print("Directory ./var already exists. Exiting!\n")
os.Exit(0)
cclog.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
}
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
log.Fatalf("Writing config.json failed: %s", err.Error())
cclog.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
log.Fatalf("Writing .env failed: %s", err.Error())
cclog.Abortf("Could not write default ./.env file with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
if err := os.Mkdir("var", 0o777); err != nil {
log.Fatalf("Mkdir var failed: %s", err.Error())
cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
err := repository.MigrateDB("sqlite3", "./var/job.db")
err := repository.MigrateDB("./var/job.db")
if err != nil {
log.Fatalf("Initialize job.db failed: %s", err.Error())
cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
if err := os.Mkdir("var/job-archive", 0o777); err != nil {
cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
archiveCfg := "{\"kind\": \"file\",\"path\": \"./var/job-archive\"}"
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error())
}
}

View File

@@ -1,10 +1,16 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// It orchestrates initialization of all subsystems including configuration,
// database, authentication, and the HTTP server.
package main
import (
"context"
"encoding/json"
"fmt"
"os"
"os/signal"
@@ -12,21 +18,28 @@ import (
"strings"
"sync"
"syscall"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/taskManager"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
"github.com/ClusterCockpit/cc-backend/internal/taskmanager"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
"github.com/ClusterCockpit/cc-backend/web"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/google/gops/agent"
"github.com/joho/godotenv"
_ "github.com/go-sql-driver/mysql"
_ "github.com/mattn/go-sqlite3"
)
@@ -39,198 +52,467 @@ const logoString = `
|_|
`
// Environment variable names
const (
envGOGC = "GOGC"
)
// Default configurations
const (
defaultArchiveConfig = `{"kind":"file","path":"./var/job-archive"}`
)
var (
date string
commit string
version string
)
func main() {
cliInit()
func printVersion() {
fmt.Print(logoString)
fmt.Printf("Version:\t%s\n", version)
fmt.Printf("Git hash:\t%s\n", commit)
fmt.Printf("Build time:\t%s\n", date)
fmt.Printf("SQL db version:\t%d\n", repository.Version)
fmt.Printf("Job archive version:\t%d\n", archive.Version)
}
if flagVersion {
fmt.Print(logoString)
fmt.Printf("Version:\t%s\n", version)
fmt.Printf("Git hash:\t%s\n", commit)
fmt.Printf("Build time:\t%s\n", date)
fmt.Printf("SQL db version:\t%d\n", repository.Version)
fmt.Printf("Job archive version:\t%d\n", archive.Version)
os.Exit(0)
func initGops() error {
if !flagGops {
return nil
}
// Apply config flags for pkg/log
log.Init(flagLogLevel, flagLogDateTime)
if err := agent.Listen(agent.Options{}); err != nil {
return fmt.Errorf("starting gops agent: %w", err)
}
return nil
}
// See https://github.com/google/gops (Runtime overhead is almost zero)
if flagGops {
if err := agent.Listen(agent.Options{}); err != nil {
log.Fatalf("gops/agent.Listen failed: %s", err.Error())
}
func loadEnvironment() error {
if err := godotenv.Load(); err != nil {
return fmt.Errorf("loading .env file: %w", err)
}
return nil
}
func initConfiguration() error {
ccconf.Init(flagConfigFile)
cfg := ccconf.GetPackageConfig("main")
if cfg == nil {
return fmt.Errorf("main configuration must be present")
}
if err := runtimeEnv.LoadEnv("./.env"); err != nil && !os.IsNotExist(err) {
log.Fatalf("parsing './.env' file failed: %s", err.Error())
}
config.Init(cfg)
return nil
}
// Initialize sub-modules and handle command line flags.
// The order here is important!
config.Init(flagConfigFile)
// As a special case for `db`, allow using an environment variable instead of the value
// stored in the config. This can be done for people having security concerns about storing
// the password for their mysql database in config.json.
if strings.HasPrefix(config.Keys.DB, "env:") {
envvar := strings.TrimPrefix(config.Keys.DB, "env:")
config.Keys.DB = os.Getenv(envvar)
}
func initDatabase() error {
repository.Connect(config.Keys.DB)
return nil
}
func handleDatabaseCommands() error {
if flagMigrateDB {
err := repository.MigrateDB(config.Keys.DBDriver, config.Keys.DB)
err := repository.MigrateDB(config.Keys.DB)
if err != nil {
log.Fatal(err)
return fmt.Errorf("migrating database to version %d: %w", repository.Version, err)
}
os.Exit(0)
cclog.Exitf("MigrateDB Success: Migrated SQLite database at '%s' to version %d.\n",
config.Keys.DB, repository.Version)
}
if flagRevertDB {
err := repository.RevertDB(config.Keys.DBDriver, config.Keys.DB)
err := repository.RevertDB(config.Keys.DB)
if err != nil {
log.Fatal(err)
return fmt.Errorf("reverting database to version %d: %w", repository.Version-1, err)
}
os.Exit(0)
cclog.Exitf("RevertDB Success: Reverted SQLite database at '%s' to version %d.\n",
config.Keys.DB, repository.Version-1)
}
if flagForceDB {
err := repository.ForceDB(config.Keys.DBDriver, config.Keys.DB)
err := repository.ForceDB(config.Keys.DB)
if err != nil {
log.Fatal(err)
return fmt.Errorf("forcing database to version %d: %w", repository.Version, err)
}
os.Exit(0)
cclog.Exitf("ForceDB Success: Forced SQLite database at '%s' to version %d.\n",
config.Keys.DB, repository.Version)
}
repository.Connect(config.Keys.DBDriver, config.Keys.DB)
return nil
}
if flagInit {
initEnv()
fmt.Print("Successfully setup environment!\n")
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
fmt.Print("Add your job-archive at ./var/job-archive.\n")
os.Exit(0)
func handleUserCommands() error {
if config.Keys.DisableAuthentication && (flagNewUser != "" || flagDelUser != "") {
return fmt.Errorf("--add-user and --del-user can only be used if authentication is enabled")
}
if !config.Keys.DisableAuthentication {
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
auth.Init()
// Check for default security keys
checkDefaultSecurityKeys()
if flagNewUser != "" {
parts := strings.SplitN(flagNewUser, ":", 3)
if len(parts) != 3 || len(parts[0]) == 0 {
log.Fatal("invalid argument format for user creation")
}
ur := repository.GetUserRepository()
if err := ur.AddUser(&schema.User{
Username: parts[0], Projects: make([]string, 0), Password: parts[2], Roles: strings.Split(parts[1], ","),
}); err != nil {
log.Fatalf("adding '%s' user authentication failed: %v", parts[0], err)
if err := addUser(flagNewUser); err != nil {
return err
}
}
if flagDelUser != "" {
ur := repository.GetUserRepository()
if err := ur.DelUser(flagDelUser); err != nil {
log.Fatalf("deleting user failed: %v", err)
if err := delUser(flagDelUser); err != nil {
return err
}
}
authHandle := auth.GetAuthInstance()
if flagSyncLDAP {
if authHandle.LdapAuth == nil {
log.Fatal("cannot sync: LDAP authentication is not configured")
if err := syncLDAP(authHandle); err != nil {
return err
}
if err := authHandle.LdapAuth.Sync(); err != nil {
log.Fatalf("LDAP sync failed: %v", err)
}
log.Info("LDAP sync successfull")
}
if flagGenJWT != "" {
ur := repository.GetUserRepository()
user, err := ur.GetUser(flagGenJWT)
if err != nil {
log.Fatalf("could not get user from JWT: %v", err)
if err := generateJWT(authHandle, flagGenJWT); err != nil {
return err
}
if !user.HasRole(schema.RoleApi) {
log.Warnf("user '%s' does not have the API role", user.Username)
}
jwt, err := authHandle.JwtAuth.ProvideJWT(user)
if err != nil {
log.Fatalf("failed to provide JWT to user '%s': %v", user.Username, err)
}
fmt.Printf("MAIN > JWT for '%s': %s\n", user.Username, jwt)
}
} else if flagNewUser != "" || flagDelUser != "" {
log.Fatal("arguments --add-user and --del-user can only be used if authentication is enabled")
}
if err := archive.Init(config.Keys.Archive, config.Keys.DisableArchive); err != nil {
log.Fatalf("failed to initialize archive: %s", err.Error())
return nil
}
// checkDefaultSecurityKeys warns if default JWT keys are detected
func checkDefaultSecurityKeys() {
// Default JWT public key from init.go
defaultJWTPublic := "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
if os.Getenv("JWT_PUBLIC_KEY") == defaultJWTPublic {
cclog.Warn("Using default JWT keys - not recommended for production environments")
}
}
func addUser(userSpec string) error {
parts := strings.SplitN(userSpec, ":", 3)
if len(parts) != 3 || len(parts[0]) == 0 {
return fmt.Errorf("invalid user format, want: <username>:[admin,support,manager,api,user]:<password>, have: %s", userSpec)
}
if err := metricdata.Init(); err != nil {
log.Fatalf("failed to initialize metricdata repository: %s", err.Error())
ur := repository.GetUserRepository()
if err := ur.AddUser(&schema.User{
Username: parts[0],
Projects: make([]string, 0),
Password: parts[2],
Roles: strings.Split(parts[1], ","),
}); err != nil {
return fmt.Errorf("adding user '%s' with roles '%s': %w", parts[0], parts[1], err)
}
cclog.Infof("Add User: Added new user '%s' with roles '%s'", parts[0], parts[1])
return nil
}
func delUser(username string) error {
ur := repository.GetUserRepository()
if err := ur.DelUser(username); err != nil {
return fmt.Errorf("deleting user '%s': %w", username, err)
}
cclog.Infof("Delete User: Deleted user '%s' from DB", username)
return nil
}
func syncLDAP(authHandle *auth.Authentication) error {
if authHandle.LdapAuth == nil {
return fmt.Errorf("LDAP authentication is not configured")
}
if err := authHandle.LdapAuth.Sync(); err != nil {
return fmt.Errorf("synchronizing LDAP: %w", err)
}
cclog.Print("Sync LDAP: LDAP synchronization successfull.")
return nil
}
func generateJWT(authHandle *auth.Authentication, username string) error {
ur := repository.GetUserRepository()
user, err := ur.GetUser(username)
if err != nil {
return fmt.Errorf("getting user '%s': %w", username, err)
}
if !user.HasRole(schema.RoleAPI) {
cclog.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username)
}
jwt, err := authHandle.JwtAuth.ProvideJWT(user)
if err != nil {
return fmt.Errorf("generating JWT for user '%s': %w", user.Username, err)
}
cclog.Printf("JWT: Successfully generated JWT for user '%s': %s\n", user.Username, jwt)
return nil
}
func initSubsystems() error {
// Initialize nats client
natsConfig := ccconf.GetPackageConfig("nats")
if err := nats.Init(natsConfig); err != nil {
cclog.Warnf("initializing (optional) nats client: %s", err.Error())
}
nats.Connect()
// Initialize job archive
archiveCfg := ccconf.GetPackageConfig("archive")
if archiveCfg == nil {
cclog.Debug("Archive configuration not found, using default archive configuration")
archiveCfg = json.RawMessage(defaultArchiveConfig)
}
if err := archive.Init(archiveCfg); err != nil {
return fmt.Errorf("initializing archive: %w", err)
}
// Handle database re-initialization
if flagReinitDB {
if err := importer.InitDB(); err != nil {
log.Fatalf("failed to re-initialize repository DB: %s", err.Error())
return fmt.Errorf("re-initializing repository DB: %w", err)
}
cclog.Print("Init DB: Successfully re-initialized repository DB.")
}
// Handle job import
if flagImportJob != "" {
if err := importer.HandleImportFlag(flagImportJob); err != nil {
log.Fatalf("job import failed: %s", err.Error())
return fmt.Errorf("importing job: %w", err)
}
cclog.Infof("Import Job: Imported Job '%s' into DB", flagImportJob)
}
// Initialize taggers
if config.Keys.EnableJobTaggers {
tagger.Init()
}
// Apply tags if requested
if flagApplyTags {
tagger.Init()
if err := tagger.RunTaggers(); err != nil {
return fmt.Errorf("running job taggers: %w", err)
}
}
if !flagServer {
return
}
archiver.Start(repository.GetJobRepository())
taskManager.Start()
serverInit()
return nil
}
func runServer(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
serverStart()
}()
// Initialize metric store if configuration is provided
haveMetricstore := false
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg != nil {
metrics := metricstore.BuildMetricList()
metricstore.Init(mscfg, metrics, &wg)
// Inject repository as NodeProvider to break import cycle
ms := metricstore.GetMemoryStore()
jobRepo := repository.GetJobRepository()
ms.SetNodeProvider(jobRepo)
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
haveMetricstore = true
} else {
metricstore.MetricStoreHandle = nil
cclog.Debug("missing internal metricstore configuration")
}
// Initialize external metric stores if configuration is provided
mscfg = ccconf.GetPackageConfig("metric-store-external")
if mscfg != nil {
err := metricdispatch.Init(mscfg)
if err != nil {
cclog.Debugf("initializing metricdispatch: %v", err)
} else {
haveMetricstore = true
}
}
if !haveMetricstore {
return fmt.Errorf("missing metricstore configuration")
}
// Start archiver and task manager
archiver.Start(repository.GetJobRepository(), ctx)
taskmanager.Start(ccconf.GetPackageConfig("cron"), ccconf.GetPackageConfig("archive"))
// Initialize web UI
cfg := ccconf.GetPackageConfig("ui")
if err := web.Init(cfg); err != nil {
return fmt.Errorf("initializing web UI: %w", err)
}
// Initialize HTTP server
srv, err := NewServer(version, commit, date)
if err != nil {
return fmt.Errorf("creating server: %w", err)
}
// Channel to collect errors from server
errChan := make(chan error, 1)
// Start HTTP server
wg.Go(func() {
if err := srv.Start(ctx); err != nil {
errChan <- err
}
})
// Handle shutdown signals
wg.Add(1)
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
go func() {
defer wg.Done()
<-sigs
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
select {
case <-sigs:
cclog.Info("Shutdown signal received")
case <-ctx.Done():
}
serverShutdown()
taskManager.Shutdown()
runtime.SystemdNotify(false, "Shutting down ...")
srv.Shutdown(ctx)
util.FsWatcherShutdown()
taskmanager.Shutdown()
}()
if os.Getenv("GOGC") == "" {
debug.SetGCPercent(25)
// Set GC percent if not configured
if os.Getenv(envGOGC) == "" {
debug.SetGCPercent(15)
}
runtime.SystemdNotify(true, "running")
waitDone := make(chan struct{})
go func() {
wg.Wait()
close(waitDone)
}()
go func() {
<-waitDone
close(errChan)
}()
// Wait for either:
// 1. An error from server startup
// 2. Completion of all goroutines (normal shutdown or crash)
select {
case err := <-errChan:
// errChan will be closed when waitDone is closed, which happens
// when all goroutines complete (either from normal shutdown or error)
if err != nil {
return err
}
case <-time.After(100 * time.Millisecond):
// Give the server 100ms to start and report any immediate startup errors
// After that, just wait for normal shutdown completion
select {
case err := <-errChan:
if err != nil {
return err
}
case <-waitDone:
// Normal shutdown completed
}
}
cclog.Print("Graceful shutdown completed!")
return nil
}
func run() error {
cliInit()
if flagVersion {
printVersion()
return nil
}
// Initialize logger
cclog.Init(flagLogLevel, flagLogDateTime)
// Handle init flag
if flagInit {
initEnv()
cclog.Exit("Successfully setup environment!\n" +
"Please review config.json and .env and adjust it to your needs.\n" +
"Add your job-archive at ./var/job-archive.")
}
// Initialize gops agent
if err := initGops(); err != nil {
return err
}
// Initialize subsystems in dependency order:
// 1. Load environment variables from .env file (contains sensitive configuration)
// 2. Load configuration from config.json (may reference environment variables)
// 3. Handle database migration commands if requested
// 4. Initialize database connection (requires config for connection string)
// 5. Handle user commands if requested (requires database and authentication config)
// 6. Initialize subsystems like archive and metrics (require config and database)
// Load environment and configuration
if err := loadEnvironment(); err != nil {
return err
}
if err := initConfiguration(); err != nil {
return err
}
// Handle database migration (migrate, revert, force)
if err := handleDatabaseCommands(); err != nil {
return err
}
// Initialize database
if err := initDatabase(); err != nil {
return err
}
// Handle user commands (add, delete, sync, JWT)
if err := handleUserCommands(); err != nil {
return err
}
// Initialize subsystems (archive, metrics, taggers)
if err := initSubsystems(); err != nil {
return err
}
// Exit if start server is not requested
if !flagServer {
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
}
// Run server with context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
return runServer(ctx)
}
func main() {
if err := run(); err != nil {
cclog.Error(err.Error())
os.Exit(1)
}
runtimeEnv.SystemdNotifiy(true, "running")
wg.Wait()
log.Print("Graceful shutdown completed!")
}

View File

@@ -1,7 +1,11 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package main provides the entry point for the ClusterCockpit backend server.
// This file contains HTTP server setup, routing configuration, and
// authentication middleware integration.
package main
import (
@@ -10,7 +14,6 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"os"
@@ -18,6 +21,7 @@ import (
"time"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/handler/transport"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
@@ -26,20 +30,32 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
"github.com/ClusterCockpit/cc-backend/web"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/runtime"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"github.com/go-chi/cors"
httpSwagger "github.com/swaggo/http-swagger"
)
var (
router *mux.Router
server *http.Server
apiHandle *api.RestApi
var buildInfo web.Build
// Environment variable names
const (
envDebug = "DEBUG"
)
// Server encapsulates the HTTP server state and dependencies
type Server struct {
router chi.Router
server *http.Server
restAPIHandle *api.RestAPI
natsAPIHandle *api.NatsAPI
}
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusUnauthorized)
@@ -49,22 +65,39 @@ func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
})
}
func serverInit() {
// NewServer creates and initializes a new Server instance
func NewServer(version, commit, buildDate string) (*Server, error) {
buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate}
s := &Server{
router: chi.NewRouter(),
}
if err := s.init(); err != nil {
return nil, err
}
return s, nil
}
func (s *Server) init() error {
// Setup the http.Handler/Router used by the server
graph.Init()
resolver := graph.GetResolverInstance()
graphQLEndpoint := handler.NewDefaultServer(
graphQLServer := handler.New(
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
if os.Getenv("DEBUG") != "1" {
graphQLServer.AddTransport(transport.POST{})
if os.Getenv(envDebug) != "1" {
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
// The problem with this is that then, no more stacktrace is printed to stderr.
graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
switch e := err.(type) {
case string:
return fmt.Errorf("MAIN > Panic: %s", e)
case error:
return fmt.Errorf("MAIN > Panic caused by: %w", e)
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
}
return errors.New("MAIN > Internal server error (panic)")
@@ -73,72 +106,70 @@ func serverInit() {
authHandle := auth.GetAuthInstance()
apiHandle = api.New()
// Middleware must be defined before routes in chi
s.router.Use(func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
start := time.Now()
ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor)
next.ServeHTTP(ww, r)
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
r.Method, r.URL.RequestURI(),
ww.Status(), float32(ww.BytesWritten())/1024,
time.Since(start).Milliseconds())
})
})
s.router.Use(middleware.Compress(5))
s.router.Use(middleware.Recoverer)
s.router.Use(cors.Handler(cors.Options{
AllowCredentials: true,
AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"},
AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"},
AllowedOrigins: []string{"*"},
}))
router = mux.NewRouter()
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
s.restAPIHandle = api.New()
info := map[string]interface{}{}
info := map[string]any{}
info["hasOpenIDConnect"] = false
if config.Keys.OpenIDConfig != nil {
if auth.Keys.OpenIDConfig != nil {
openIDConnect := auth.NewOIDC(authHandle)
openIDConnect.RegisterEndpoints(router)
openIDConnect.RegisterEndpoints(s.router)
info["hasOpenIDConnect"] = true
}
router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
log.Debugf("##%v##", info)
cclog.Debugf("##%v##", info)
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
}).Methods(http.MethodGet)
router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
})
s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
})
router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
s.router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
})
secured := router.PathPrefix("/").Subrouter()
securedapi := router.PathPrefix("/api").Subrouter()
userapi := router.PathPrefix("/userapi").Subrouter()
configapi := router.PathPrefix("/config").Subrouter()
frontendapi := router.PathPrefix("/frontend").Subrouter()
if !config.Keys.DisableAuthentication {
router.Handle("/login", authHandle.Login(
// On success: Handled within Login()
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
})).Methods(http.MethodPost)
// Create login failure handler (used by both /login and /jwt-login)
loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
}
router.Handle("/jwt-login", authHandle.Login(
// On success: Handled within Login()
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
}))
s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP)
s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP)
router.Handle("/logout", authHandle.Logout(
s.router.Post("/logout", authHandle.Logout(
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusOK)
@@ -149,139 +180,196 @@ func serverInit() {
Build: buildInfo,
Infos: info,
})
}))).Methods(http.MethodPost)
secured.Use(func(next http.Handler) http.Handler {
return authHandle.Auth(
// On success;
next,
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Authentication failed - ClusterCockpit",
MsgType: "alert-danger",
Message: err.Error(),
Build: buildInfo,
Infos: info,
Redirect: r.RequestURI,
})
})
})
securedapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthApi(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
userapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthUserApi(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
configapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthConfigApi(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
frontendapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthFrontendApi(
// On success;
next,
// On failure: JSON Response
onFailureResponse)
})
})).ServeHTTP)
}
if flagDev {
router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
s.router.Get("/swagger/*", httpSwagger.Handler(
httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json")))
}
secured.Handle("/query", graphQLEndpoint)
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
routerConfig.HandleSearchBar(rw, r, buildInfo)
// Secured routes (require authentication)
s.router.Group(func(secured chi.Router) {
if !config.Keys.DisableAuthentication {
secured.Use(func(next http.Handler) http.Handler {
return authHandle.Auth(
next,
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Authentication failed - ClusterCockpit",
MsgType: "alert-danger",
Message: err.Error(),
Build: buildInfo,
Infos: info,
Redirect: r.RequestURI,
})
})
})
}
secured.Handle("/query", graphQLServer)
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
routerConfig.HandleSearchBar(rw, r, buildInfo)
})
routerConfig.SetupRoutes(secured, buildInfo)
})
// Mount all /monitoring/... and /api/... routes.
routerConfig.SetupRoutes(secured, buildInfo)
apiHandle.MountApiRoutes(securedapi)
apiHandle.MountUserApiRoutes(userapi)
apiHandle.MountConfigApiRoutes(configapi)
apiHandle.MountFrontendApiRoutes(frontendapi)
// API routes (JWT token auth)
s.router.Route("/api", func(apiRouter chi.Router) {
// Main API routes with API auth
apiRouter.Group(func(securedapi chi.Router) {
if !config.Keys.DisableAuthentication {
securedapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthAPI(next, onFailureResponse)
})
}
s.restAPIHandle.MountAPIRoutes(securedapi)
})
// Metric store API routes with separate auth
apiRouter.Group(func(metricstoreapi chi.Router) {
if !config.Keys.DisableAuthentication {
metricstoreapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthMetricStoreAPI(next, onFailureResponse)
})
}
s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi)
})
})
// User API routes
s.router.Route("/userapi", func(userapi chi.Router) {
if !config.Keys.DisableAuthentication {
userapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthUserAPI(next, onFailureResponse)
})
}
s.restAPIHandle.MountUserAPIRoutes(userapi)
})
// Config API routes (uses Group with full paths to avoid shadowing
// the /config page route that is registered in the secured group)
s.router.Group(func(configapi chi.Router) {
if !config.Keys.DisableAuthentication {
configapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthConfigAPI(next, onFailureResponse)
})
}
s.restAPIHandle.MountConfigAPIRoutes(configapi)
})
// Frontend API routes
s.router.Route("/frontend", func(frontendapi chi.Router) {
if !config.Keys.DisableAuthentication {
frontendapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthFrontendAPI(next, onFailureResponse)
})
}
s.restAPIHandle.MountFrontendAPIRoutes(frontendapi)
})
if config.Keys.APISubjects != nil {
s.natsAPIHandle = api.NewNatsAPI()
if err := s.natsAPIHandle.StartSubscriptions(); err != nil {
return fmt.Errorf("starting NATS subscriptions: %w", err)
}
}
// 404 handler for pages and API routes
notFoundHandler := func(rw http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") ||
strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") {
rw.Header().Set("Content-Type", "application/json")
rw.WriteHeader(http.StatusNotFound)
json.NewEncoder(rw).Encode(map[string]string{
"status": "Resource not found",
"error": "the requested endpoint does not exist",
})
return
}
rw.Header().Set("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusNotFound)
web.RenderTemplate(rw, "404.tmpl", &web.Page{
Title: "Page Not Found",
Build: buildInfo,
})
}
// Set NotFound on the router so chi uses it for all unmatched routes,
// including those under subrouters like /api, /userapi, /frontend, etc.
s.router.NotFound(notFoundHandler)
if config.Keys.EmbedStaticFiles {
if i, err := os.Stat("./var/img"); err == nil {
if i.IsDir() {
log.Info("Use local directory for static images")
router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
cclog.Info("Use local directory for static images")
s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
}
}
router.PathPrefix("/").Handler(web.ServeFiles())
fileServer := http.StripPrefix("/", web.ServeFiles())
s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
if web.StaticFileExists(r.URL.Path) {
fileServer.ServeHTTP(rw, r)
return
}
notFoundHandler(rw, r)
}))
} else {
router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
staticDir := http.Dir(config.Keys.StaticFiles)
fileServer := http.FileServer(staticDir)
s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
f, err := staticDir.Open(r.URL.Path)
if err == nil {
f.Close()
fileServer.ServeHTTP(rw, r)
return
}
notFoundHandler(rw, r)
}))
}
router.Use(handlers.CompressHandler)
router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
router.Use(handlers.CORS(
handlers.AllowCredentials(),
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"})))
return nil
}
func serverStart() {
handler := handlers.CustomLoggingHandler(io.Discard, router, func(_ io.Writer, params handlers.LogFormatterParams) {
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
} else {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
}
})
// Server timeout defaults (in seconds)
const (
defaultReadTimeout = 20
defaultWriteTimeout = 20
)
server = &http.Server{
ReadTimeout: 20 * time.Second,
WriteTimeout: 20 * time.Second,
Handler: handler,
func (s *Server) Start(ctx context.Context) error {
// Use configurable timeouts with defaults
readTimeout := time.Duration(defaultReadTimeout) * time.Second
writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
s.server = &http.Server{
ReadTimeout: readTimeout,
WriteTimeout: writeTimeout,
Handler: s.router,
Addr: config.Keys.Addr,
}
// Start http or https server
listener, err := net.Listen("tcp", config.Keys.Addr)
if err != nil {
log.Fatalf("starting http listener failed: %v", err)
return fmt.Errorf("starting listener on '%s': %w", config.Keys.Addr, err)
}
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
go func() {
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
}()
}
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
if config.Keys.HTTPSCertFile != "" && config.Keys.HTTPSKeyFile != "" {
cert, err := tls.LoadX509KeyPair(
config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
config.Keys.HTTPSCertFile, config.Keys.HTTPSKeyFile)
if err != nil {
log.Fatalf("loading X509 keypair failed: %v", err)
return fmt.Errorf("loading X509 keypair (check 'https-cert-file' and 'https-key-file' in config.json): %w", err)
}
listener = tls.NewListener(listener, &tls.Config{
Certificates: []tls.Certificate{cert},
@@ -292,27 +380,54 @@ func serverStart() {
MinVersion: tls.VersionTLS12,
PreferServerCipherSuites: true,
})
fmt.Printf("HTTPS server listening at %s...", config.Keys.Addr)
cclog.Infof("HTTPS server listening at %s...", config.Keys.Addr)
} else {
fmt.Printf("HTTP server listening at %s...", config.Keys.Addr)
cclog.Infof("HTTP server listening at %s...", config.Keys.Addr)
}
//
// Because this program will want to bind to a privileged port (like 80), the listener must
// be established first, then the user can be changed, and after that,
// the actual http server can be started.
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
log.Fatalf("error while preparing server start: %s", err.Error())
if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
return fmt.Errorf("dropping privileges: %w", err)
}
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
log.Fatalf("starting server failed: %v", err)
// Handle context cancellation for graceful shutdown
go func() {
<-ctx.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
if err := s.server.Shutdown(shutdownCtx); err != nil {
cclog.Errorf("Server shutdown error: %v", err)
}
}()
if err = s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
return fmt.Errorf("server failed: %w", err)
}
return nil
}
func serverShutdown() {
func (s *Server) Shutdown(ctx context.Context) {
// Create a shutdown context with timeout
shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
nc := nats.GetClient()
if nc != nil {
nc.Close()
}
// First shut down the server gracefully (waiting for all ongoing requests)
server.Shutdown(context.Background())
if err := s.server.Shutdown(shutdownCtx); err != nil {
cclog.Errorf("Server shutdown error: %v", err)
}
// Then, wait for any async archivings still pending...
archiver.WaitForArchiving()
// Archive all the metric store data
metricstore.Shutdown()
// Shutdown archiver with 10 second timeout for fast shutdown
if err := archiver.Shutdown(10 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown: %v", err)
}
}

View File

@@ -1,67 +1,29 @@
{
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"archive": {
"kind": "file",
"path": "./var/job-archive"
"main": {
"addr": "127.0.0.1:8080"
},
"jwts": {
"max-age": "2000h"
"cron": {
"commit-job-worker": "1m",
"duration-worker": "3m",
"footprint-worker": "5m"
},
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"emission-constant": 317,
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"auth": {
"jwts": {
"max-age": "2000h"
}
]
},
"metric-store-external": [
{
"scope": "fritz",
"url": "http://0.0.0.0:8082",
"token": "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI6MTc2ODU3ODg0NCwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9._SDEW9WaUVXSBFmWqGhyIZXLoqoDU8F1hkfh4cXKIqF4yw7w50IUpfUBtwUFUOnoviFKoi563f6RAMC7XxeLDA"
}
],
"metric-store": {
"checkpoints": {
"interval": "12h"
},
"retention-in-memory": "48h",
"memory-cap": 100
}
}

View File

@@ -1,69 +0,0 @@
{
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2000h"
},
"db-driver": "mysql",
"db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit",
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"emission-constant": 317,
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
]
}

View File

@@ -1,50 +1,99 @@
{
"main": {
"addr": "0.0.0.0:443",
"ldap": {
"url": "ldaps://test",
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
"user_filter": "(&(objectclass=posixAccount))"
},
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": {
"kind": "file",
"path": "./var/job-archive"
"api-allowed-ips": ["*"],
"short-running-jobs-duration": 300,
"enable-job-taggers": true,
"nodestate-retention": {
"policy": "move",
"target-kind": "file",
"target-path": "./var/nodestate-archive"
},
"validate": true,
"clusters": [
{
"name": "test",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "eyJhbGciOiJF-E-pQBQ"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
],
"resampling": {
"minimum-points": 600,
"trigger": 180,
"resolutions": [240, 60]
},
"api-subjects": {
"subject-job-event": "cc.job.event",
"subject-node-state": "cc.node.state"
}
},
"nats": {
"address": "nats://0.0.0.0:4222",
"username": "root",
"password": "root"
},
"auth": {
"jwts": {
"cookieName": "",
"validateUser": false,
"max-age": "2000h",
"trustedIssuer": ""
"max-age": "2000h"
}
},
"cron": {
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "s3",
"endpoint": "http://x.x.x.x",
"bucket": "jobarchive",
"access-key": "xx",
"secret-key": "xx",
"retention": {
"policy": "move",
"age": 365,
"location": "./var/archive"
}
},
"metric-store-external": [
{
"scope": "*",
"url": "http://x.x.x.x:8082",
"token": "MySecret"
},
"short-running-jobs-duration": 300
{
"scope": "fritz",
"url": "http://x.x.x.x:8084",
"token": "MySecret"
},
{
"scope": "fritz-spr1tb",
"url": "http://x.x.x.x:8083",
"token": "MySecret"
},
{
"scope": "alex",
"url": "http://x.x.x.x:8084",
"token": "MySecret"
}
],
"metric-store": {
"checkpoints": {
"interval": "12h",
"directory": "./var/checkpoints"
},
"memory-cap": 100,
"retention-in-memory": "48h",
"cleanup": {
"mode": "archive",
"interval": "48h",
"directory": "./var/archive"
},
"nats-subscriptions": [
{
"subscribe-to": "hpc-nats",
"cluster-tag": "fritz"
},
{
"subscribe-to": "hpc-nats",
"cluster-tag": "alex"
}
]
},
"ui-file": "ui-config.json"
}

View File

@@ -1,12 +0,0 @@
{
"clusters": [
{
"name": "fritz",
"default_metrics": "cpu_load, flops_any, core_power, lustre_open, mem_used, mem_bw, net_bytes_in"
},
{
"name": "alex",
"default_metrics": "flops_any, mem_bw, mem_used, vectorization_ratio"
}
]
}

View File

@@ -0,0 +1,22 @@
{
"cluster": "fritz",
"jobId": 123000,
"jobState": "running",
"numAcc": 0,
"numHwthreads": 72,
"numNodes": 1,
"partition": "main",
"requestedMemory": 128000,
"resources": [{ "hostname": "f0726" }],
"startTime": 1649723812,
"subCluster": "main",
"submitTime": 1649723812,
"user": "k106eb10",
"project": "k106eb",
"walltime": 86400,
"metaData": {
"slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n",
"jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n",
"jobName": "ams_pipeline"
}
}

View File

@@ -0,0 +1,7 @@
{
"cluster": "fritz",
"jobId": 123000,
"jobState": "completed",
"startTime": 1649723812,
"stopTime": 1649763839
}

419
configs/tagger/README.md Normal file
View File

@@ -0,0 +1,419 @@
# Job Tagging Configuration
ClusterCockpit provides automatic job tagging functionality to classify and
categorize jobs based on configurable rules. The tagging system consists of two
main components:
1. **Application Detection** - Identifies which application a job is running
2. **Job Classification** - Analyzes job performance characteristics and applies classification tags
## Directory Structure
```
configs/tagger/
├── apps/ # Application detection patterns
│ ├── vasp.txt
│ ├── gromacs.txt
│ └── ...
└── jobclasses/ # Job classification rules
├── parameters.json
├── lowUtilization.json
├── highload.json
└── ...
```
## Activating Tagger Rules
### Step 1: Copy Configuration Files
To activate tagging, review, adapt, and copy the configuration files from
`configs/tagger/` to `var/tagger/`:
```bash
# From the cc-backend root directory
mkdir -p var/tagger
cp -r configs/tagger/apps var/tagger/
cp -r configs/tagger/jobclasses var/tagger/
```
### Step 2: Enable Tagging in Configuration
Add or set the following configuration key in the `main` section of your `config.json`:
```json
{
"enable-job-taggers": true
}
```
**Important**: Automatic tagging is disabled by default. You must explicitly
enable it by setting `enable-job-taggers: true` in the main configuration file.
### Step 3: Restart cc-backend
The tagger system automatically loads configuration from `./var/tagger/` at
startup. After copying the files and enabling the feature, restart cc-backend:
```bash
./cc-backend -server
```
### Step 4: Verify Configuration Loaded
Check the logs for messages indicating successful configuration loading:
```
[INFO] Setup file watch for ./var/tagger/apps
[INFO] Setup file watch for ./var/tagger/jobclasses
```
## How Tagging Works
### Automatic Tagging
When `enable-job-taggers` is set to `true` in the configuration, tags are
automatically applied when:
- **Job Start**: Application detection runs immediately when a job starts
- **Job Stop**: Job classification runs when a job completes
The system analyzes job metadata and metrics to determine appropriate tags.
**Note**: Automatic tagging only works for jobs that start or stop after the
feature is enabled. Existing jobs are not automatically retagged.
### Manual Tagging (Retroactive)
To apply tags to existing jobs in the database, use the `-apply-tags` command
line option:
```bash
./cc-backend -apply-tags
```
This processes all jobs in the database and applies current tagging rules. This
is useful when:
- You have existing jobs that were created before tagging was enabled
- You've added new tagging rules and want to apply them to historical data
- You've modified existing rules and want to re-evaluate all jobs
### Hot Reload
The tagger system watches the configuration directories for changes. You can
modify or add rules without restarting `cc-backend`:
- Changes to `var/tagger/apps/*` are detected automatically
- Changes to `var/tagger/jobclasses/*` are detected automatically
## Application Detection
Application detection identifies which software a job is running by matching
patterns in the job script.
### Configuration Format
Application patterns are stored in text files under `var/tagger/apps/`. Each
file contains one or more regular expression patterns (one per line) that match
against the job script.
**Example: `apps/vasp.txt`**
```
vasp
VASP
```
### How It Works
1. When a job starts, the system retrieves the job script from metadata
2. Each line in the app files is treated as a regex pattern
3. Patterns are matched case-insensitively against the lowercased job script
4. If a match is found, a tag of type `app` with the filename (without extension) is applied
5. Only the first matching application is tagged
### Adding New Applications
1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`)
2. Add regex patterns, one per line:
```
tensorflow
tf\.keras
import tensorflow
```
3. The file is automatically detected and loaded
**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`).
## Job Classification
Job classification analyzes completed jobs based on their metrics and properties
to identify performance issues or characteristics.
### Configuration Format
Job classification rules are defined in JSON files under
`var/tagger/jobclasses/`. Each rule file defines:
- **Metrics required**: Which job metrics to analyze
- **Requirements**: Pre-conditions that must be met
- **Variables**: Computed values used in the rule
- **Rule expression**: Boolean expression that determines if the rule matches
- **Hint template**: Message displayed when the rule matches
### Parameters File
`jobclasses/parameters.json` defines shared threshold values used across multiple rules:
```json
{
"lowcpuload_threshold_factor": 0.9,
"highmemoryusage_threshold_factor": 0.9,
"job_min_duration_seconds": 600.0,
"sampling_interval_seconds": 30.0
}
```
### Rule File Structure
**Example: `jobclasses/lowUtilization.json`**
```json
{
"name": "Low resource utilization",
"tag": "lowutilization",
"parameters": ["job_min_duration_seconds"],
"metrics": ["flops_any", "mem_bw"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "mem_bw_perc",
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
}
],
"rule": "flops_any.avg < flops_any.limits.alert",
"hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
}
```
#### Field Descriptions
| Field | Description |
| -------------- | ----------------------------------------------------------------------------- |
| `name` | Human-readable description of the rule |
| `tag` | Tag identifier applied when the rule matches |
| `parameters` | List of parameter names from `parameters.json` to include in rule environment |
| `metrics` | List of metrics required for evaluation (must be present in job data) |
| `requirements` | Boolean expressions that must all be true for the rule to be evaluated |
| `variables` | Named expressions computed before evaluating the main rule |
| `rule` | Boolean expression that determines if the job matches this classification |
| `hint` | Go template string for generating a user-visible message |
### Expression Environment
Expressions in `requirements`, `variables`, and `rule` have access to:
**Job Properties:**
- `job.shared` - Shared node allocation type
- `job.duration` - Job runtime in seconds
- `job.numCores` - Number of CPU cores
- `job.numNodes` - Number of nodes
- `job.jobState` - Job completion state
- `job.numAcc` - Number of accelerators
- `job.smt` - SMT setting
**Metric Statistics (for each metric in `metrics`):**
- `<metric>.min` - Minimum value
- `<metric>.max` - Maximum value
- `<metric>.avg` - Average value
- `<metric>.limits.peak` - Peak limit from cluster config
- `<metric>.limits.normal` - Normal threshold
- `<metric>.limits.caution` - Caution threshold
- `<metric>.limits.alert` - Alert threshold
**Parameters:**
- All parameters listed in the `parameters` field
**Variables:**
- All variables defined in the `variables` array
### Expression Language
Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations:
- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^`
- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Logical**: `&&`, `||`, `!`
- **Functions**: Standard math functions (see expr documentation)
### Hint Templates
Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible:
```
{{.flops_any.avg}} # Access metric average
{{.job.duration}} # Access job property
{{.my_variable}} # Access computed variable
```
### Adding New Classification Rules
1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`)
2. Define the rule structure:
```json
{
"name": "Memory Leak Detection",
"tag": "memory_leak",
"parameters": ["memory_leak_slope_threshold"],
"metrics": ["mem_used"],
"requirements": ["job.duration > 3600"],
"variables": [
{
"name": "mem_growth",
"expr": "(mem_used.max - mem_used.min) / job.duration"
}
],
"rule": "mem_growth > memory_leak_slope_threshold",
"hint": "Memory usage grew by {{.mem_growth}} per second"
}
```
3. Add any new parameters to `parameters.json`
4. The file is automatically detected and loaded
## Configuration Paths
The tagger system reads from these paths (relative to cc-backend working directory):
- **Application patterns**: `./var/tagger/apps/`
- **Job classification rules**: `./var/tagger/jobclasses/`
These paths are defined as constants in the source code and cannot be changed without recompiling.
## Troubleshooting
### Tags Not Applied
1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json`
2. **Check configuration exists**:
```bash
ls -la var/tagger/apps
ls -la var/tagger/jobclasses
```
3. **Check logs for errors**:
```bash
./cc-backend -server -loglevel debug
```
4. **Verify file permissions**: Ensure cc-backend can read the configuration files
5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs
### Rules Not Matching
1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation
2. **Check requirements**: Ensure all requirements in the rule are satisfied
3. **Verify metrics exist**: Classification rules require job metrics to be available
4. **Check metric names**: Ensure metric names match those in your cluster configuration
### File Watch Not Working
If changes to configuration files aren't detected:
1. Restart cc-backend to reload all configuration
2. Check filesystem supports file watching (network filesystems may not)
3. Check logs for file watch setup messages
## Best Practices
1. **Start Simple**: Begin with basic rules and refine based on results
2. **Use Requirements**: Filter out irrelevant jobs early with requirements
3. **Test Incrementally**: Add one rule at a time and verify behavior
4. **Document Rules**: Use descriptive names and clear hint messages
5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency
6. **Version Control**: Keep your `var/tagger/` configuration in version control
7. **Backup Before Changes**: Test new rules on a copy before deploying to production
## Examples
### Simple Application Detection
**File: `var/tagger/apps/python.txt`**
```
python
python3
\.py
```
This detects jobs running Python scripts.
### Complex Classification Rule
**File: `var/tagger/jobclasses/cpuImbalance.json`**
```json
{
"name": "CPU Load Imbalance",
"tag": "cpu_imbalance",
"parameters": ["core_load_imbalance_threshold_factor"],
"metrics": ["cpu_load"],
"requirements": ["job.numCores > 1", "job.duration > 600"],
"variables": [
{
"name": "load_variance",
"expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg"
}
],
"rule": "load_variance > core_load_imbalance_threshold_factor",
"hint": "CPU load varies by {{printf \"%.1f%%\" (load_variance * 100)}} across cores"
}
```
This detects jobs where CPU load is unevenly distributed across cores.
## Reference
### Configuration Options
**Main Configuration (`config.json`)**:
- `enable-job-taggers` (boolean, default: `false`) - Enables automatic job tagging system
- Must be set to `true` to activate automatic tagging on job start/stop events
- Does not affect the `-apply-tags` command line option
**Command Line Options**:
- `-apply-tags` - Apply all tagging rules to existing jobs in the database
- Works independently of `enable-job-taggers` configuration
- Useful for retroactively tagging jobs or re-evaluating with updated rules
### Default Configuration Location
The example configurations are provided in:
- `configs/tagger/apps/` - Example application patterns (16 applications)
- `configs/tagger/jobclasses/` - Example classification rules (3 rules)
Copy these to `var/tagger/` and customize for your environment.
### Tag Types
- `app` - Application tags (e.g., "vasp", "gromacs")
- `jobClass` - Classification tags (e.g., "lowutilization", "highload")
Tags can be queried and filtered in the ClusterCockpit UI and API.

View File

@@ -0,0 +1 @@
alf

View File

@@ -0,0 +1,7 @@
calc_rate
qmdffgen
dynamic
evbopt
explore
black_box
poly_qmdff

View File

@@ -0,0 +1,3 @@
chroma
qdp
qmp

View File

@@ -0,0 +1 @@
cp2k

View File

@@ -0,0 +1 @@
cpmd

View File

@@ -0,0 +1 @@
flame

View File

@@ -0,0 +1,3 @@
gromacs
gmx
mdrun

View File

@@ -0,0 +1 @@
julia

View File

@@ -0,0 +1 @@
lmp

View File

@@ -0,0 +1 @@
matlab

View File

@@ -0,0 +1 @@
openfoam

View File

@@ -0,0 +1 @@
orca

View File

@@ -0,0 +1,4 @@
python
pip
anaconda
conda

View File

@@ -0,0 +1,2 @@
starccm+
-podkey

View File

@@ -0,0 +1,10 @@
dscf
grad
ridft
rdgrad
ricc2
statpt
aoforce
escf
egrad
odft

View File

@@ -0,0 +1,3 @@
vasp_gam
vasp_ncl
vasp_std

View File

@@ -0,0 +1,21 @@
{
"name": "High memory usage",
"tag": "highmemory",
"parameters": [
"highmemoryusage_threshold_factor",
"job_min_duration_seconds"
],
"metrics": ["mem_used"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "memory_usage_pct",
"expr": "mem_used.max / mem_used.limits.peak * 100.0"
}
],
"rule": "mem_used.max > memory_used.limits.alert",
"hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
}

View File

@@ -0,0 +1,21 @@
{
"name": "Excessive CPU load",
"tag": "excessiveload",
"parameters": [
"excessivecpuload_threshold_factor",
"job_min_duration_seconds"
],
"metrics": ["cpu_load"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "load_threshold",
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
}
],
"rule": "cpu_load.avg > load_threshold",
"hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention."
}

View File

@@ -0,0 +1,22 @@
{
"name": "Low resource utilization",
"tag": "lowutilization",
"parameters": ["job_min_duration_seconds"],
"metrics": ["flops_any", "mem_bw"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "mem_bw_pct",
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
},
{
"name": "flops_any_pct",
"expr": "flops_any.avg / flops_any.limits.peak * 100.0"
}
],
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
"hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds."
}

View File

@@ -0,0 +1,18 @@
{
"name": "Low CPU load",
"tag": "lowload",
"parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"],
"metrics": ["cpu_load"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "load_threshold",
"expr": "cpu_load.limits.peak * lowcpuload_threshold_factor"
}
],
"rule": "cpu_load.avg < load_threshold",
"hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})."
}

View File

@@ -0,0 +1,22 @@
{
"name": "Memory bandwidth bound",
"tag": "memorybound",
"parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"],
"metrics": ["mem_bw"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "mem_bw_threshold",
"expr": "mem_bw.limits.peak * membound_bw_threshold_factor"
},
{
"name": "mem_bw_pct",
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
}
],
"rule": "mem_bw.avg > mem_bw_threshold",
"hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) is within {{.membound_bw_threshold_factor}} of peak bandwidth. Consider improving data reuse or compute intensity."
}

View File

@@ -0,0 +1,15 @@
{
"lowcpuload_threshold_factor": 0.85,
"excessivecpuload_threshold_factor": 1.2,
"highmemoryusage_threshold_factor": 0.9,
"node_load_imbalance_threshold_factor": 0.1,
"core_load_imbalance_threshold_factor": 0.1,
"high_memory_load_threshold_factor": 0.9,
"lowgpuload_threshold_factor": 0.7,
"membound_bw_threshold_factor": 0.8,
"memory_leak_slope_threshold": 0.1,
"job_min_duration_seconds": 600.0,
"sampling_interval_seconds": 30.0,
"cpu_load_pre_cutoff_samples": 11.0,
"cpu_load_core_pre_cutoff_samples": 6.0
}

45
configs/uiConfig.json Normal file
View File

@@ -0,0 +1,45 @@
{
"job-list": {
"use-paging": false,
"show-footprint":false
},
"job-view": {
"show-polar-plot": true,
"show-footprint": true,
"show-roofline": true,
"show-stat-table": true
},
"metric-config": {
"job-list-metrics": ["mem_bw", "flops_dp"],
"job-view-plot-metrics": ["mem_bw", "flops_dp"],
"job-view-table-metrics": ["mem_bw", "flops_dp"],
"clusters": [
{
"name": "test",
"sub-clusters": [
{
"name": "one",
"job-list-metrics": ["mem_used", "flops_sp"]
}
]
}
]
},
"node-list": {
"use-paging": true
},
"plot-configuration": {
"plots-per-row": 3,
"color-background": true,
"line-width": 3,
"color-scheme": [
"#00bfff",
"#0000ff",
"#ff00ff",
"#ff0000",
"#ff8000",
"#ffff00",
"#80ff00"
]
}
}

170
go.mod
View File

@@ -1,90 +1,126 @@
module github.com/ClusterCockpit/cc-backend
go 1.23.5
go 1.25.0
require (
github.com/99designs/gqlgen v0.17.66
github.com/ClusterCockpit/cc-units v0.4.0
github.com/Masterminds/squirrel v1.5.4
github.com/coreos/go-oidc/v3 v3.12.0
github.com/go-co-op/gocron/v2 v2.16.0
github.com/go-ldap/ldap/v3 v3.4.10
github.com/go-sql-driver/mysql v1.9.0
github.com/golang-jwt/jwt/v5 v5.2.1
github.com/golang-migrate/migrate/v4 v4.18.2
github.com/google/gops v0.3.28
github.com/gorilla/handlers v1.5.2
github.com/gorilla/mux v1.8.1
github.com/gorilla/sessions v1.4.0
github.com/influxdata/influxdb-client-go/v2 v2.14.0
github.com/jmoiron/sqlx v1.4.0
github.com/mattn/go-sqlite3 v1.14.24
github.com/prometheus/client_golang v1.21.0
github.com/prometheus/common v0.62.0
github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.4
github.com/vektah/gqlparser/v2 v2.5.22
golang.org/x/crypto v0.35.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/oauth2 v0.27.0
golang.org/x/time v0.5.0
tool (
github.com/99designs/gqlgen
github.com/swaggo/swag/cmd/swag
)
require (
filippo.io/edwards25519 v1.1.0 // indirect
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/99designs/gqlgen v0.17.86
github.com/ClusterCockpit/cc-lib/v2 v2.6.0
github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.1
github.com/aws/aws-sdk-go-v2/config v1.32.8
github.com/aws/aws-sdk-go-v2/credentials v1.19.8
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0
github.com/coreos/go-oidc/v3 v3.17.0
github.com/expr-lang/expr v1.17.8
github.com/go-chi/chi/v5 v5.2.5
github.com/go-chi/cors v1.2.2
github.com/go-co-op/gocron/v2 v2.19.1
github.com/go-ldap/ldap/v3 v3.4.12
github.com/golang-jwt/jwt/v5 v5.3.1
github.com/golang-migrate/migrate/v4 v4.19.1
github.com/google/gops v0.3.29
github.com/gorilla/sessions v1.4.0
github.com/influxdata/line-protocol/v2 v2.2.1
github.com/jmoiron/sqlx v1.4.0
github.com/joho/godotenv v1.5.1
github.com/linkedin/goavro/v2 v2.15.0
github.com/mattn/go-sqlite3 v1.14.34
github.com/parquet-go/parquet-go v0.27.0
github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/stretchr/testify v1.11.1
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.6
github.com/vektah/gqlparser/v2 v2.5.31
golang.org/x/crypto v0.48.0
golang.org/x/oauth2 v0.35.0
golang.org/x/time v0.14.0
)
require (
github.com/Azure/go-ntlmssp v0.1.0 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/andybalholm/brotli v1.2.0 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect
github.com/go-jose/go-jose/v4 v4.0.5 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/spec v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 // indirect
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect
github.com/aws/smithy-go v1.24.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
github.com/go-openapi/jsonpointer v0.22.4 // indirect
github.com/go-openapi/jsonreference v0.21.4 // indirect
github.com/go-openapi/spec v0.22.3 // indirect
github.com/go-openapi/swag/conv v0.25.4 // indirect
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
github.com/go-openapi/swag/loading v0.25.4 // indirect
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
github.com/goccy/go-yaml v1.19.2 // indirect
github.com/golang/snappy v1.0.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.4 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/nats-io/nats.go v1.48.0 // indirect
github.com/nats-io/nkeys v0.4.15 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.2 // indirect
github.com/parquet-go/bitpack v1.0.0 // indirect
github.com/parquet-go/jsonlite v1.4.0 // indirect
github.com/pierrec/lz4/v4 v4.1.25 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/rogpeppe/go-internal v1.10.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sosodev/duration v1.3.1 // indirect
github.com/stmcginnis/gofish v0.21.1 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.5 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.uber.org/atomic v1.11.0 // indirect
golang.org/x/mod v0.23.0 // indirect
golang.org/x/net v0.35.0 // indirect
golang.org/x/sync v0.11.0 // indirect
golang.org/x/sys v0.30.0 // indirect
golang.org/x/text v0.22.0 // indirect
golang.org/x/tools v0.30.0 // indirect
google.golang.org/protobuf v1.36.5 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
github.com/twpayne/go-geom v1.6.1 // indirect
github.com/urfave/cli/v2 v2.27.7 // indirect
github.com/urfave/cli/v3 v3.6.1 // indirect
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a // indirect
golang.org/x/mod v0.33.0 // indirect
golang.org/x/net v0.50.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.41.0 // indirect
golang.org/x/text v0.34.0 // indirect
golang.org/x/tools v0.42.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)

459
go.sum
View File

@@ -1,128 +1,196 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/99designs/gqlgen v0.17.66 h1:2/SRc+h3115fCOZeTtsqrB5R5gTGm+8qCAwcrZa+CXA=
github.com/99designs/gqlgen v0.17.66/go.mod h1:gucrb5jK5pgCKzAGuOMMVU9C8PnReecHEHd2UxLQwCg=
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/ClusterCockpit/cc-units v0.4.0 h1:zP5DOu99GmErW0tCDf0gcLrlWt42RQ9dpoONEOh4cI0=
github.com/ClusterCockpit/cc-units v0.4.0/go.mod h1:3S3PAhAayS3pbgcT4q9Vn9VJw22Op51X0YimtG77zBw=
github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRcU6w=
github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw=
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g=
github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0=
github.com/ClusterCockpit/cc-lib/v2 v2.6.0 h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg=
github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0=
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0=
github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U=
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI=
github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4=
github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY=
github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc=
github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI=
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU=
github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4=
github.com/aws/aws-sdk-go-v2/config v1.32.8 h1:iu+64gwDKEoKnyTQskSku72dAwggKI5sV6rNvgSMpMs=
github.com/aws/aws-sdk-go-v2/config v1.32.8/go.mod h1:MI2XvA+qDi3i9AJxX1E2fu730syEBzp/jnXrjxuHwgI=
github.com/aws/aws-sdk-go-v2/credentials v1.19.8 h1:Jp2JYH1lRT3KhX4mshHPvVYsR5qqRec3hGvEarNYoR0=
github.com/aws/aws-sdk-go-v2/credentials v1.19.8/go.mod h1:fZG9tuvyVfxknv1rKibIz3DobRaFw1Poe8IKtXB3XYY=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 h1:JqcdRG//czea7Ppjb+g/n4o8i/R50aTBHkA7vu0lK+k=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17/go.mod h1:CO+WeGmIdj/MlPel2KwID9Gt7CNq4M65HUfBW97liM0=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 h1:Z5EiPIzXKewUQK0QTMkutjiaPVeVYXX7KIqhXu/0fXs=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8/go.mod h1:FsTpJtvC4U1fyDXk7c71XoDv3HlRm8V3NiYLeYLh5YE=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 h1:RuNSMoozM8oXlgLG/n6WLaFGoea7/CddrCfIiSA+xdY=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17/go.mod h1:F2xxQ9TZz5gDWsclCtPQscGpP0VUOc8RqgFM3vDENmU=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 h1:bGeHBsGZx0Dvu/eJC0Lh9adJa3M1xREcndxLNZlve2U=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17/go.mod h1:dcW24lbU0CzHusTE8LLHhRLI42ejmINN8Lcr22bwh/g=
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0 h1:oeu8VPlOre74lBA/PMhxa5vewaMIMmILM+RraSyB8KA=
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0/go.mod h1:5jggDlZ2CLQhwJBiZJb4vfk4f0GxWdEDruWKEJ1xOdo=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 h1:VrhDvQib/i0lxvr3zqlUwLwJP4fpmpyD9wYG1vfSu+Y=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5/go.mod h1:k029+U8SY30/3/ras4G/Fnv/b88N4mAfliNn08Dem4M=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 h1:v6EiMvhEYBoHABfbGB4alOYmCIrcgyPPiBE1wZAEbqk=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9/go.mod h1:yifAsgBxgJWn3ggx70A3urX2AN49Y5sJTD1UQFlfqBw=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14 h1:0jbJeuEHlwKJ9PfXtpSFc4MF+WIWORdhN1n30ITZGFM=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14/go.mod h1:sTGThjphYE4Ohw8vJiRStAcu3rbjtXRsdNB0TvZ5wwo=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/pJ1jOWYlFDJTjRQ=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ=
github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk=
github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc/Jo=
github.com/coreos/go-oidc/v3 v3.12.0/go.mod h1:gE3LgjOgFoHi9a4ce4/tJczr0Ai2/BoDhf0r5lltWI0=
github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc=
github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/dhui/dktest v0.4.4 h1:+I4s6JRE1yGuqflzwqG+aIaMdgXIorCf5P98JnaAWa8=
github.com/dhui/dktest v0.4.4/go.mod h1:4+22R4lgsdAXrDyaH4Nqx2JEz2hLp49MqQmm9HLCQhM=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/docker v27.2.0+incompatible h1:Rk9nIVdfH3+Vz4cyI/uhbINhEZ/oLmc+CBXmH6fbNk4=
github.com/docker/docker v27.2.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk=
github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs=
github.com/go-co-op/gocron/v2 v2.16.0/go.mod h1:opexeOFy5BplhsKdA7bzY9zeYih8I8/WNJ4arTIFPVc=
github.com/go-jose/go-jose/v4 v4.0.5 h1:M6T8+mKZl/+fNNuFHvGIzDz7BTLQPIounk/b9dw3AaE=
github.com/go-jose/go-jose/v4 v4.0.5/go.mod h1:s3P1lRrkT8igV8D9OjyL4WRyHvjB6a4JSllnOrmmBOA=
github.com/go-ldap/ldap/v3 v3.4.10 h1:ot/iwPOhfpNVgB1o+AVXljizWZ9JTp7YF5oeyONmcJU=
github.com/go-ldap/ldap/v3 v3.4.10/go.mod h1:JXh4Uxgi40P6E9rdsYqpUtbW46D9UTjJ9QSwGRznplY=
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9ZY=
github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM=
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo=
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE=
github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58=
github.com/go-co-op/gocron/v2 v2.19.1 h1:B4iLeA0NB/2iO3EKQ7NfKn5KsQgZfjb2fkvoZJU3yBI=
github.com/go-co-op/gocron/v2 v2.19.1/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U=
github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs=
github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4=
github.com/go-ldap/ldap/v3 v3.4.12/go.mod h1:+SPAGcTtOfmGsCb3h1RFiq4xpp4N636G75OEace8lNo=
github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4=
github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80=
github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8=
github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4=
github.com/go-openapi/spec v0.22.3 h1:qRSmj6Smz2rEBxMnLRBMeBWxbbOvuOoElvSvObIgwQc=
github.com/go-openapi/spec v0.22.3/go.mod h1:iIImLODL2loCh3Vnox8TY2YWYJZjMAKYyLH2Mu8lOZs=
github.com/go-openapi/swag v0.19.15 h1:D2NRCBzS9/pEY3gP9Nl8aDqGUcPFrwG2p+CNFrLyrCM=
github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4=
github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU=
github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI=
github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag=
github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA=
github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY=
github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo=
github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM=
github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s=
github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE=
github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8=
github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0=
github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw=
github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE=
github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw=
github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc=
github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4=
github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg=
github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls=
github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
github.com/go-sql-driver/mysql v1.9.0 h1:Y0zIbQXhQKmQgTp44Y1dp3wTXcn804QoTptLZT1vtvo=
github.com/go-sql-driver/mysql v1.9.0/go.mod h1:pDetrLJeA3oMujJuvXc8RJoasr589B6A9fwzD3QMrqw=
github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIxtHqx8aGss=
github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt/v5 v5.2.1 h1:OuVbFODueb089Lh128TAcimifWaLhJwVflnrgM17wHk=
github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/golang-migrate/migrate/v4 v4.18.2 h1:2VSCMz7x7mjyTXx3m2zPokOY82LTRgxK1yQYKo6wWQ8=
github.com/golang-migrate/migrate/v4 v4.18.2/go.mod h1:2CM6tJvn2kqPXwnXO/d3rAQYiyoIm180VsO8PRX6Rpk=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro=
github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM=
github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA=
github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c=
github.com/google/gops v0.3.29 h1:n98J2qSOK1NJvRjdLDcjgDryjpIBGhbaqph1mXKL0rY=
github.com/google/gops v0.3.29/go.mod h1:8N3jZftuPazvUwtYY/ncG4iPrjp15ysNKLfq+QQPiwc=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE=
github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+HVt/4epWDjd4=
github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA=
github.com/gorilla/securecookie v1.1.2/go.mod h1:NfCASbcHqRSY+3a8tlWJwsQap2VX5pwzwo4h3eOamfo=
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
github.com/gorilla/sessions v1.4.0 h1:kpIYOp/oi6MG/p5PgxApU8srsSw9tuFbt46Lt7auzqQ=
github.com/gorilla/sessions v1.4.0/go.mod h1:FLWm50oby91+hl7p/wRxDth9bWSuk0qVL2emc7lT5ik=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8=
github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4=
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
@@ -137,19 +205,18 @@ github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZ
github.com/jcmturner/rpc/v2 v2.0.3/go.mod h1:VUJYCIDm3PVOEHw8sgt091/20OJjskO/YJki3ELg/Hc=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I=
github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw=
@@ -159,52 +226,57 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
github.com/linkedin/goavro/v2 v2.15.0 h1:pDj1UrjUOO62iXhgBiE7jQkpNIc5/tA5eZsgolMjgVI=
github.com/linkedin/goavro/v2 v2.15.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
github.com/mattn/go-sqlite3 v1.14.34 h1:3NtcvcUnFBPsuRcno8pUtupspG/GM+9nZ88zgJcp6Zk=
github.com/mattn/go-sqlite3 v1.14.34/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI=
github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA=
github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs=
github.com/parquet-go/jsonlite v1.4.0 h1:RTG7prqfO0HD5egejU8MUDBN8oToMj55cgSV1I0zNW4=
github.com/parquet-go/jsonlite v1.4.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0=
github.com/parquet-go/parquet-go v0.27.0 h1:vHWK2xaHbj+v1DYps03yDRpEsdtOeKbhiXUaixoPb3g=
github.com/parquet-go/parquet-go v0.27.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg=
github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0=
github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.21.0 h1:DIsaGmiaBkSangBgMtWdNfxbMNdku5IK6iNhrEqWvdA=
github.com/prometheus/client_golang v1.21.0/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io=
github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws=
github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw=
github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0=
github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
@@ -214,138 +286,101 @@ github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NF
github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.21.1 h1:sutDvBhmLh4RDOZ1DN8GUyYRu7f1ggvKMMnSaiqhwn4=
github.com/stmcginnis/gofish v0.21.1/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE=
github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg=
github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64a5ww=
github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ=
github.com/swaggo/swag v1.16.4 h1:clWJtd9LStiG3VeijiCfOVODP6VpHtKdQy9ELFG3s1A=
github.com/swaggo/swag v1.16.4/go.mod h1:VBsHJRsDvfYvqoiMKnsdwhNV9LEMHgEDZcyVYX0sxPg=
github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=
github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=
github.com/vektah/gqlparser/v2 v2.5.22 h1:yaaeJ0fu+nv1vUMW0Hl+aS1eiv1vMfapBNjpffAda1I=
github.com/vektah/gqlparser/v2 v2.5.22/go.mod h1:xMl+ta8a5M1Yo1A1Iwt/k7gSpscwSnHZdw7tfhEGfTM=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI=
github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg=
github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4=
github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028=
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo=
github.com/urfave/cli/v3 v3.6.1/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso=
github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkWBTJ7k=
github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8=
go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw=
go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8=
go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc=
go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8=
go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4=
go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs=
golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a h1:ovFr6Z0MNmU7nH8VaX5xqw+05ST2uO1exVfZPVqRC5o=
golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM=
golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY=
golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM=
google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=

View File

@@ -32,6 +32,7 @@ resolver:
autobind:
- "github.com/99designs/gqlgen/graphql/introspection"
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
- "github.com/ClusterCockpit/cc-backend/internal/config"
# This section declares type mapping between the GraphQL and go type systems
#
@@ -51,61 +52,51 @@ models:
- github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32
Job:
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Job"
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job"
fields:
tags:
resolver: true
metaData:
resolver: true
Cluster:
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Cluster"
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster"
fields:
partitions:
resolver: true
NullableFloat:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
# Node:
# model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node"
# fields:
# metaData:
# resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" }
JobStatistics:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" }
GlobalMetricListItem:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.GlobalMetricListItem",
}
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" }
ClusterSupport:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
Resource:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
JobState:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
TimeRange:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
IntRange:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
JobMetric:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" }
Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" }
SchedulerState:
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" }
HealthState:
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" }
MetricStatistics:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics",
}
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" }
MetricConfig:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" }
SubClusterConfig:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig",
}
Accelerator:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
Topology:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" }
FilterRanges:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" }

View File

@@ -3,7 +3,7 @@ Description=ClusterCockpit Web Server
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
After=mariadb.service mysql.service
# Database is file-based SQLite - no service dependency required
[Service]
WorkingDirectory=/opt/monitoring/cc-backend
@@ -12,7 +12,7 @@ NotifyAccess=all
Restart=on-failure
RestartSec=30
TimeoutStopSec=100
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json --server
[Install]
WantedBy=multi-user.target

View File

@@ -1,5 +1,5 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api_test
@@ -23,41 +23,47 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/mux"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
_ "github.com/mattn/go-sqlite3"
)
func setup(t *testing.T) *api.RestApi {
func setup(t *testing.T) *api.RestAPI {
repository.ResetConnection()
const testconfig = `{
"addr": "0.0.0.0:8080",
"validate": false,
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2m"
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"metric-store": {
"checkpoints": {
"interval": "12h"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
"retention-in-memory": "48h",
"memory-cap": 100
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJson = `{
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
@@ -113,58 +119,73 @@ func setup(t *testing.T) *api.RestApi {
]
}`
log.Init("info", true)
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0777); err != nil {
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
config.Init(cfgFilePath)
ccconf.Init(cfgFilePath)
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
if err := metricdata.Init(); err != nil {
t.Fatal(err)
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
archiver.Start(repository.GetJobRepository())
auth.Init()
graph.Init()
return api.New()
}
func cleanup() {
// TODO: Clear all caches, reset all modules, etc...
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
/*
@@ -191,21 +212,19 @@ func TestRestApi(t *testing.T) {
},
}
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
restapi.MountApiRoutes(r)
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
var TestJobId int64 = 123
var TestClusterName string = "testcluster"
var TestJobID int64 = 123
TestClusterName := "testcluster"
var TestStartTime int64 = 123456789
const startJobBody string = `{
"jobId": 123,
"jobId": 123,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
@@ -215,10 +234,9 @@ func TestRestApi(t *testing.T) {
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"exclusive": 1,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }],
"resources": [
{
"hostname": "host123",
@@ -249,16 +267,17 @@ func TestRestApi(t *testing.T) {
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
}
resolver := graph.GetResolverInstance()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
// resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
job.Tags, err = resolver.Job().Tags(ctx, job)
if err != nil {
t.Fatal(err)
}
// job.Tags, err = resolver.Job().Tags(ctx, job)
// if err != nil {
// t.Fatal(err)
// }
if job.JobID != 123 ||
job.User != "testuser" ||
@@ -267,21 +286,20 @@ func TestRestApi(t *testing.T) {
job.SubCluster != "sc1" ||
job.Partition != "default" ||
job.Walltime != 3600 ||
job.ArrayJobId != 0 ||
job.ArrayJobID != 0 ||
job.NumNodes != 1 ||
job.NumHWThreads != 8 ||
job.NumAcc != 0 ||
job.Exclusive != 1 ||
job.MonitoringStatus != 1 ||
job.SMT != 1 ||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
job.StartTime.Unix() != 123456789 {
job.StartTime != 123456789 {
t.Fatalf("unexpected job properties: %#v", job)
}
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
t.Fatalf("unexpected tags: %#v", job.Tags)
}
// if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
// t.Fatalf("unexpected tags: %#v", job.Tags)
// }
}); !ok {
return
}
@@ -308,8 +326,8 @@ func TestRestApi(t *testing.T) {
t.Fatal(response.Status, recorder.Body.String())
}
archiver.WaitForArchiving()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
// Archiving happens asynchronously, will be completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -337,7 +355,7 @@ func TestRestApi(t *testing.T) {
}
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
@@ -349,7 +367,7 @@ func TestRestApi(t *testing.T) {
t.Run("CheckDoubleStart", func(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
recorder := httptest.NewRecorder()
@@ -371,7 +389,7 @@ func TestRestApi(t *testing.T) {
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"exclusive": 1,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
@@ -399,6 +417,7 @@ func TestRestApi(t *testing.T) {
}
time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBodyFailed string = `{
"jobId": 12345,
@@ -420,7 +439,7 @@ func TestRestApi(t *testing.T) {
t.Fatal(response.Status, recorder.Body.String())
}
archiver.WaitForArchiving()
// Archiving happens asynchronously, will be completed in cleanup
jobid, cluster := int64(12345), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {
@@ -434,4 +453,198 @@ func TestRestApi(t *testing.T) {
if !ok {
t.Fatal("subtest failed")
}
t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"api"},
AuthType: 0,
AuthSource: 2,
}
req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
var result api.GetUsedNodesAPIResponse
if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
t.Fatal(err)
}
if result.UsedNodes == nil {
t.Fatal("expected usedNodes to be non-nil")
}
if len(result.UsedNodes) != 0 {
t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
}
})
}
// TestStopJobWithReusedJobId verifies that stopping a recently started job works
// even when an older job with the same jobId exists in the job table (e.g. with
// state "failed"). This is a regression test for the bug where Find() on the job
// table would match the old job instead of the new one still in job_cache.
func TestStopJobWithReusedJobId(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
const contextUserKey repository.ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
// Step 1: Start the first job (jobId=999)
const startJobBody1 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 200000000
}`
if ok := t.Run("StartFirstJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 2: Sync to move job from cache to job table, then stop it as "failed"
time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBody1 string = `{
"jobId": 999,
"startTime": 200000000,
"cluster": "testcluster",
"jobState": "failed",
"stopTime": 200001000
}`
if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
jobid, cluster := int64(999), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {
t.Fatal(err)
}
if job.State != schema.JobStateFailed {
t.Fatalf("expected first job to be failed, got: %s", job.State)
}
}); !ok {
return
}
// Wait for archiving to complete
time.Sleep(1 * time.Second)
// Step 3: Start a NEW job with the same jobId=999 but different startTime.
// This job will sit in job_cache (not yet synced).
const startJobBody2 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 300000000
}`
if ok := t.Run("StartSecondJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 4: Stop the second job WITHOUT syncing first.
// Before the fix, this would fail because Find() on the job table would
// match the old failed job (jobId=999) and reject with "already stopped".
const stopJobBody2 string = `{
"jobId": 999,
"startTime": 300000000,
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 300001000
}`
t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatalf("expected stop to succeed for cached job, got: %s %s",
recorder.Result().Status, recorder.Body.String())
}
})
}

71
internal/api/cluster.go Normal file
View File

@@ -0,0 +1,71 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// GetClustersAPIResponse model
type GetClustersAPIResponse struct {
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
}
// getClusters godoc
// @summary Lists all cluster configs
// @tags Cluster query
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/clusters/ [get]
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleAPI) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
var clusters []*schema.Cluster
if r.URL.Query().Has("cluster") {
name := r.URL.Query().Get("cluster")
cluster := archive.GetCluster(name)
if cluster == nil {
handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
return
}
clusters = append(clusters, cluster)
} else {
clusters = archive.Clusters
}
payload := GetClustersAPIResponse{
Clusters: clusters,
}
if err := json.NewEncoder(bw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}

File diff suppressed because it is too large Load Diff

1154
internal/api/job.go Normal file

File diff suppressed because it is too large Load Diff

165
internal/api/log.go Normal file
View File

@@ -0,0 +1,165 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os/exec"
"regexp"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type LogEntry struct {
Timestamp string `json:"timestamp"`
Priority int `json:"priority"`
Message string `json:"message"`
Unit string `json:"unit"`
}
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
return
}
since := r.URL.Query().Get("since")
if since == "" {
since = "1 hour ago"
}
if !safePattern.MatchString(since) {
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
return
}
lines := 200
if l := r.URL.Query().Get("lines"); l != "" {
n, err := strconv.Atoi(l)
if err != nil || n < 1 {
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
return
}
if n > 1000 {
n = 1000
}
lines = n
}
unit := config.Keys.SystemdUnit
if unit == "" {
unit = "clustercockpit.service"
}
args := []string{
"--output=json",
"--no-pager",
"-n", fmt.Sprintf("%d", lines),
"--since", since,
"-u", unit,
}
if level := r.URL.Query().Get("level"); level != "" {
n, err := strconv.Atoi(level)
if err != nil || n < 0 || n > 7 {
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
return
}
args = append(args, "--priority", fmt.Sprintf("%d", n))
}
if search := r.URL.Query().Get("search"); search != "" {
if !safePattern.MatchString(search) {
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
return
}
args = append(args, "--grep", search)
}
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
stdout, err := cmd.StdoutPipe()
if err != nil {
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
return
}
if err := cmd.Start(); err != nil {
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
return
}
entries := make([]LogEntry, 0, lines)
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
var raw map[string]any
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
cclog.Debugf("error unmarshal log output: %v", err)
continue
}
priority := 6 // default info
if p, ok := raw["PRIORITY"]; ok {
switch v := p.(type) {
case string:
if n, err := strconv.Atoi(v); err == nil {
priority = n
}
case float64:
priority = int(v)
}
}
msg := ""
if m, ok := raw["MESSAGE"]; ok {
if s, ok := m.(string); ok {
msg = s
}
}
ts := ""
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
if s, ok := t.(string); ok {
ts = s
}
}
unitName := ""
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
if s, ok := u.(string); ok {
unitName = s
}
}
entries = append(entries, LogEntry{
Timestamp: ts,
Priority: priority,
Message: msg,
Unit: unitName,
})
}
if err := cmd.Wait(); err != nil {
// journalctl returns exit code 1 when --grep matches nothing
if len(entries) == 0 {
cclog.Debugf("journalctl exited with: %v", err)
}
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(entries); err != nil {
cclog.Errorf("Failed to encode log entries: %v", err)
}
}

137
internal/api/metricstore.go Normal file
View File

@@ -0,0 +1,137 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
// handleFree godoc
// @summary
// @tags free
// @description This endpoint allows the users to free the Buffers from the
// metric store. This endpoint offers the users to remove then systematically
// and also allows then to prune the data under node, if they do not want to
// remove the whole node.
// @produce json
// @param to query string false "up to timestamp"
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /free/ [post]
func freeMetrics(rw http.ResponseWriter, r *http.Request) {
rawTo := r.URL.Query().Get("to")
if rawTo == "" {
handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
return
}
to, err := strconv.ParseInt(rawTo, 10, 64)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
bodyDec := json.NewDecoder(r.Body)
var selectors [][]string
err = bodyDec.Decode(&selectors)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
ms := metricstore.GetMemoryStore()
n := 0
for _, sel := range selectors {
bn, err := ms.Free(sel, to)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
n += bn
}
rw.WriteHeader(http.StatusOK)
fmt.Fprintf(rw, "buffers freed: %d\n", n)
}
// handleWrite godoc
// @summary Receive metrics in InfluxDB line-protocol
// @tags write
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
// @accept plain
// @produce json
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /write/ [post]
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
bytes, err := io.ReadAll(r.Body)
rw.Header().Add("Content-Type", "application/json")
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw)
return
}
rw.WriteHeader(http.StatusOK)
}
// handleDebug godoc
// @summary Debug endpoint
// @tags debug
// @description This endpoint allows the users to print the content of
// nodes/clusters/metrics to review the state of the data.
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /debug/ [post]
func debugMetrics(rw http.ResponseWriter, r *http.Request) {
raw := r.URL.Query().Get("selector")
rw.Header().Add("Content-Type", "application/json")
selector := []string{}
if len(raw) != 0 {
selector = strings.Split(raw, ":")
}
ms := metricstore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}

400
internal/api/nats.go Normal file
View File

@@ -0,0 +1,400 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"database/sql"
"encoding/json"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/schema"
influx "github.com/influxdata/line-protocol/v2/lineprotocol"
)
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
// job,function=start_job event="{...JSON payload...}" <timestamp>
// job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
// nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct {
JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions
// when checking for duplicate jobs during startJob calls.
RepositoryMutex sync.Mutex
}
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
func NewNatsAPI() *NatsAPI {
return &NatsAPI{
JobRepository: repository.GetJobRepository(),
}
}
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
// Returns an error if the NATS client is not available or subscription fails.
func (api *NatsAPI) StartSubscriptions() error {
client := nats.GetClient()
if client == nil {
cclog.Warn("NATS client not available, skipping API subscriptions")
return nil
}
if config.Keys.APISubjects != nil {
s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
return err
}
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
return err
}
cclog.Info("NATS API subscriptions started")
}
return nil
}
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
function, ok := msg.GetTag("function")
if !ok {
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
return
}
switch function {
case "start_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job start event is missing event field with JSON payload")
return
}
api.handleStartJob(v)
case "stop_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job stop event is missing event field with JSON payload")
return
}
api.handleStopJob(v)
default:
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
}
}
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
// - tag "function" with value "start_job" or "stop_job"
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "job" {
api.processJobEvent(m)
} else {
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
}
}
}
// handleStartJob processes job start messages received via NATS.
// The payload parameter contains JSON following the schema.Job structure.
// Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
if payload == "" {
cclog.Error("NATS start job: payload is empty")
return
}
req := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS start job: parsing request failed: %v", err)
return
}
cclog.Debugf("NATS start job: %s", req.GoString())
req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil {
cclog.Errorf("NATS start job: sanity check failed: %v", err)
return
}
var unlockOnce sync.Once
api.RepositoryMutex.Lock()
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
return
}
if err == nil {
for _, job := range jobs {
if (req.StartTime - job.StartTime) < secondsPerDay {
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
req.JobID, req.Cluster, job.ID)
return
}
}
}
// When tags are present, insert directly into the job table so that the
// returned ID can be used with AddTagOrCreate (which queries the job table).
var id int64
if len(req.Tags) > 0 {
id, err = api.JobRepository.StartDirect(&req)
} else {
id, err = api.JobRepository.Start(&req)
}
if err != nil {
cclog.Errorf("NATS start job: insert into database failed: %v", err)
return
}
unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
return
}
}
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
id, req.Cluster, req.JobID, req.User, req.StartTime)
}
// handleStopJob processes job stop messages received via NATS.
// The payload parameter contains JSON following the StopJobAPIRequest structure.
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
if payload == "" {
cclog.Error("NATS stop job: payload is empty")
return
}
var req StopJobAPIRequest
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
return
}
if req.JobID == nil {
cclog.Errorf("NATS job stop: the field 'jobId' is required")
return
}
isCached := false
job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Not in cache, try main job table
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
cclog.Errorf("NATS job stop: finding job failed: %v", err)
return
}
} else {
isCached = true
}
if job.State != schema.JobStateRunning {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
job.JobID, job.ID, job.Cluster, job.State)
return
}
if job.StartTime > req.StopTime {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
return
}
if req.State != "" && !req.State.Valid() {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
job.JobID, job.ID, job.Cluster, req.State)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
}
job.Duration = int32(req.StopTime - job.StartTime)
job.State = req.State
api.JobRepository.Mutex.Lock()
defer api.JobRepository.Mutex.Unlock()
// If the job is still in job_cache, transfer it to the job table first
if isCached {
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
if err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
job.JobID, *job.ID, job.Cluster, err)
return
}
cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
job.ID = &newID
}
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
job.JobID, *job.ID, job.Cluster, job.State, err)
return
}
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
return
}
archiver.TriggerArchiving(job)
}
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Nodestate event is missing event field with JSON payload")
return
}
var req UpdateNodeStatesRequest
dec := json.NewDecoder(strings.NewReader(v))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
return
}
repo := repository.GetNodeRepository()
requestReceived := time.Now().Unix()
for _, node := range req.Nodes {
state := determineState(node.States)
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
JobsRunning: node.JobsRunning,
}
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "nodestate" {
api.processNodestateEvent(m)
} else {
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
}
}
}

947
internal/api/nats_test.go Normal file
View File

@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
func setupNatsTest(t *testing.T) *NatsAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc1",
"nodes": "host123,host124,host125",
"processorType": "Intel Core i7-4770",
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
}
}
],
"metricConfig": [
{
"name": "load_one",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
"alert": 0
}
]
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return NewNatsAPI()
}
func cleanupNatsTest() {
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
func TestNatsHandleStartJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
shouldFindJob bool
}{
{
name: "valid job start",
payload: `{
"jobId": 1001,
"user": "testuser1",
"project": "testproj1",
"cluster": "testcluster",
"partition": "main",
"walltime": 7200,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1001 {
t.Errorf("expected JobID 1001, got %d", job.JobID)
}
if job.User != "testuser1" {
t.Errorf("expected user testuser1, got %s", job.User)
}
if job.State != schema.JobStateRunning {
t.Errorf("expected state running, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number",
"user": "testuser2"
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "missing required fields",
payload: `{
"jobId": 1002
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
payload: `{
"jobId": 1003,
"user": "testuser3",
"project": "testproj3",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"unknownField": "should cause error",
"startTime": 1234567900
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with tags",
payload: `{
"jobId": 1004,
"user": "testuser4",
"project": "testproj4",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"tags": [
{
"type": "test",
"name": "testtag",
"scope": "testuser4"
}
],
"startTime": 1234567910
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1004 {
t.Errorf("expected JobID 1004, got %d", job.JobID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleStartJob(tt.payload)
natsAPI.JobRepository.SyncJobs()
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if tt.shouldFindJob {
// Extract jobId from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
startTime := int64(payloadMap["startTime"].(float64))
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
if err != nil {
if !tt.expectError {
t.Fatalf("expected to find job, but got error: %v", err)
}
return
}
if tt.validateJob != nil {
tt.validateJob(t, job)
}
}
})
}
}
func TestNatsHandleStopJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// First, create a running job
startPayload := `{
"jobId": 2001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(startPayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
setupJobFunc func() // Optional: create specific test job
}{
{
name: "valid job stop - completed",
payload: `{
"jobId": 2001,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateCompleted {
t.Errorf("expected state completed, got %s", job.State)
}
expectedDuration := int32(1234571490 - 1234567890)
if job.Duration != expectedDuration {
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
}
},
},
{
name: "valid job stop - failed",
setupJobFunc: func() {
startPayloadFailed := `{
"jobId": 2002,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(startPayloadFailed)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2002,
"cluster": "testcluster",
"startTime": 1234567900,
"jobState": "failed",
"stopTime": 1234569900
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateFailed {
t.Errorf("expected state failed, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number"
}`,
expectError: true,
},
{
name: "missing jobId",
payload: `{
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
{
name: "invalid job state",
setupJobFunc: func() {
startPayloadInvalid := `{
"jobId": 2003,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1]
}
],
"startTime": 1234567910
}`
natsAPI.handleStartJob(startPayloadInvalid)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2003,
"cluster": "testcluster",
"startTime": 1234567910,
"jobState": "invalid_state",
"stopTime": 1234571510
}`,
expectError: true,
},
{
name: "stopTime before startTime",
setupJobFunc: func() {
startPayloadTime := `{
"jobId": 2004,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0]
}
],
"startTime": 1234567920
}`
natsAPI.handleStartJob(startPayloadTime)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2004,
"cluster": "testcluster",
"startTime": 1234567920,
"jobState": "completed",
"stopTime": 1234567900
}`,
expectError: true,
},
{
name: "job not found",
payload: `{
"jobId": 99999,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
}
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.setupJobFunc != nil {
tt.setupJobFunc()
}
natsAPI.handleStopJob(tt.payload)
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if !tt.expectError && tt.validateJob != nil {
// Extract job details from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
var startTime *int64
if st, ok := payloadMap["startTime"]; ok {
t := int64(st.(float64))
startTime = &t
}
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
if err != nil {
t.Fatalf("expected to find job, but got error: %v", err)
}
tt.validateJob(t, job)
}
})
}
}
func TestNatsHandleNodeState(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
validateFn func(t *testing.T)
}{
{
name: "valid node state update",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
validateFn: func(t *testing.T) {
// In a full test, we would verify the node state was updated in the database
// For now, just ensure no error occurred
},
},
{
name: "multiple nodes",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid JSON in event field",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
expectError: true,
},
{
name: "empty nodes array",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: false, // Empty array should not cause error
},
{
name: "invalid line protocol format",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Should be handled gracefully with warning
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
// Allow some time for async operations
time.Sleep(50 * time.Millisecond)
if tt.validateFn != nil {
tt.validateFn(t)
}
})
}
}
func TestNatsProcessJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
msgStartJob, err := lp.NewMessage(
"job",
map[string]string{"function": "start_job"},
nil,
map[string]any{
"event": `{
"jobId": 3001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgMissingTag, err := lp.NewMessage(
"job",
map[string]string{},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgUnknownFunc, err := lp.NewMessage(
"job",
map[string]string{"function": "unknown_function"},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
tests := []struct {
name string
message lp.CCMessage
expectError bool
}{
{
name: "start_job function",
message: msgStartJob,
expectError: false,
},
{
name: "missing function tag",
message: msgMissingTag,
expectError: true,
},
{
name: "unknown function",
message: msgUnknownFunc,
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.processJobEvent(tt.message)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
}{
{
name: "valid influx line protocol",
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid influx line protocol",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Decoder should handle empty input gracefully
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// HandleJobEvent doesn't return errors, it logs them
// We're just ensuring it doesn't panic
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "non-event message (metric data)",
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
expectError: false,
description: "Should skip non-event messages gracefully",
},
{
name: "wrong measurement name",
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
expectError: false,
description: "Should warn about unexpected measurement but not fail",
},
{
name: "missing event field",
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
expectError: true,
description: "Should error when event field is missing",
},
{
name: "multiple measurements in one message",
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
expectError: false,
description: "Should process multiple lines",
},
{
name: "escaped quotes in JSON payload",
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
expectError: true,
description: "Should handle escaped quotes (though JSON parsing may fail)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "missing cluster field in JSON",
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail when cluster is missing",
},
{
name: "malformed JSON with unescaped quotes",
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail on malformed JSON",
},
{
name: "unicode characters in hostname",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle unicode characters",
},
{
name: "very large node count",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle multiple nodes efficiently",
},
{
name: "timestamp in past",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
expectError: false,
description: "Should accept any valid timestamp",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// Start a job
payload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(payload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Try to start the same job again (within 24 hours)
duplicatePayload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(duplicatePayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Verify only one job exists
jobID := int64(5001)
cluster := "testcluster"
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
if err != nil && err != sql.ErrNoRows {
t.Fatalf("unexpected error: %v", err)
}
if len(jobs) != 1 {
t.Errorf("expected 1 job, got %d", len(jobs))
}
}

145
internal/api/node.go Normal file
View File

@@ -0,0 +1,145 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"fmt"
"maps"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type UpdateNodeStatesRequest struct {
Nodes []schema.NodePayload `json:"nodes"`
Cluster string `json:"cluster" example:"fritz"`
}
// metricListToNames converts a map of metric configurations to a list of metric names
func metricListToNames(metricList map[string]*schema.Metric) []string {
names := make([]string, 0, len(metricList))
for name := range metricList {
names = append(names, name)
}
return names
}
// this routine assumes that only one of them exists per node
func determineState(states []string) schema.SchedulerState {
for _, state := range states {
switch strings.ToLower(state) {
case "allocated":
return schema.NodeStateAllocated
case "reserved":
return schema.NodeStateReserved
case "idle":
return schema.NodeStateIdle
case "down":
return schema.NodeStateDown
case "mixed":
return schema.NodeStateMixed
}
}
return schema.NodeStateUnknown
}
// updateNodeStates godoc
// @summary Deliver updated Slurm node states
// @tags Nodestates
// @description Returns a JSON-encoded list of users.
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/nodestats/ [post]
func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := UpdateNodeStatesRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err),
http.StatusBadRequest, rw)
return
}
requestReceived := time.Now().Unix()
repo := repository.GetNodeRepository()
m := make(map[string][]string)
metricNames := make(map[string][]string)
healthResults := make(map[string]metricstore.HealthCheckResult)
startMs := time.Now()
// Step 1: Build nodeList and metricList per subcluster
for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
}
for sc := range m {
if sc != "" {
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
metricNames[sc] = metricListToNames(metricList)
}
}
// Step 2: Determine which metric store to query and perform health check
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
if err != nil {
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
} else {
for sc, nl := range m {
if sc != "" {
if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
maps.Copy(healthResults, results)
}
}
}
}
cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
startDB := time.Now()
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
var healthMetrics string
if result, ok := healthResults[node.Hostname]; ok {
healthState = result.State
healthMetrics = result.HealthMetrics
}
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: healthState,
HealthMetrics: healthMetrics,
JobsRunning: node.JobsRunning,
}
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
}

File diff suppressed because it is too large Load Diff

221
internal/api/user.go Normal file
View File

@@ -0,0 +1,221 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
)
type APIReturnedUser struct {
Username string `json:"username"`
Name string `json:"name"`
Roles []string `json:"roles"`
Email string `json:"email"`
Projects []string `json:"projects"`
}
// getUsers godoc
// @summary Returns a list of users
// @tags User
// @description Returns a JSON-encoded list of users.
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
// @failure 500 {string} string "Internal Server Error"
// @security ApiKeyAuth
// @router /api/users/ [get]
func (api *RestAPI) getUsers(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to fetch a list of users"), http.StatusForbidden, rw)
return
}
users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
if err != nil {
handleError(fmt.Errorf("listing users failed: %w", err), http.StatusInternalServerError, rw)
return
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(users); err != nil {
cclog.Errorf("Failed to encode users response: %v", err)
}
}
// updateUser godoc
// @summary Update user roles and projects
// @tags User
// @description Allows admins to add/remove roles and projects for a user
// @produce plain
// @param id path string true "Username"
// @param add-role formData string false "Role to add"
// @param remove-role formData string false "Role to remove"
// @param add-project formData string false "Project to add"
// @param remove-project formData string false "Project to remove"
// @success 200 {string} string "Success message"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/user/{id} [post]
func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to update a user"), http.StatusForbidden, rw)
return
}
// Get Values
newrole := r.FormValue("add-role")
delrole := r.FormValue("remove-role")
newproj := r.FormValue("add-project")
delproj := r.FormValue("remove-project")
rw.Header().Set("Content-Type", "application/json")
// Handle role updates
if newrole != "" {
if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Role Success"}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delrole != "" {
if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Role Success"}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if newproj != "" {
if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Project Success"}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delproj != "" {
if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Project Success"}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
} else {
handleError(fmt.Errorf("no operation specified: must provide add-role, remove-role, add-project, or remove-project"), http.StatusBadRequest, rw)
}
}
// createUser godoc
// @summary Create a new user
// @tags User
// @description Creates a new user with specified credentials and role
// @produce plain
// @param username formData string true "Username"
// @param password formData string false "Password (not required for API users)"
// @param role formData string true "User role"
// @param name formData string false "Full name"
// @param email formData string false "Email address"
// @param project formData string false "Project (required for managers)"
// @success 200 {string} string "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/users/ [post]
func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
rw.Header().Set("Content-Type", "text/plain")
me := repository.GetUserFromContext(r.Context())
if !me.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to create new users"), http.StatusForbidden, rw)
return
}
username, password, role, name, email, project := r.FormValue("username"),
r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
r.FormValue("email"), r.FormValue("project")
// Validate username length
if len(username) == 0 || len(username) > 100 {
handleError(fmt.Errorf("username must be between 1 and 100 characters"), http.StatusBadRequest, rw)
return
}
if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
return
}
if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
handleError(fmt.Errorf("only managers require a project (can be changed later)"), http.StatusBadRequest, rw)
return
} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
handleError(fmt.Errorf("managers require a project to manage (can be changed later)"), http.StatusBadRequest, rw)
return
}
if err := repository.GetUserRepository().AddUser(&schema.User{
Username: username,
Name: name,
Password: password,
Email: email,
Projects: []string{project},
Roles: []string{role},
}); err != nil {
handleError(fmt.Errorf("adding user failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
fmt.Fprintf(rw, "User %v successfully created!\n", username)
}
// deleteUser godoc
// @summary Delete a user
// @tags User
// @description Deletes a user from the system
// @produce plain
// @param username formData string true "Username to delete"
// @success 200 {string} string "Success"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
// @security ApiKeyAuth
// @router /api/users/ [delete]
func (api *RestAPI) deleteUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to delete a user"), http.StatusForbidden, rw)
return
}
username := r.FormValue("username")
if err := repository.GetUserRepository().DelUser(username); err != nil {
handleError(fmt.Errorf("deleting user failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
rw.WriteHeader(http.StatusOK)
}

189
internal/archiver/README.md Normal file
View File

@@ -0,0 +1,189 @@
# Archiver Package
The `archiver` package provides asynchronous job archiving functionality for ClusterCockpit. When jobs complete, their metric data is archived from the metric store to a persistent archive backend (filesystem, S3, SQLite, etc.).
## Architecture
### Producer-Consumer Pattern
```
┌──────────────┐ TriggerArchiving() ┌───────────────┐
│ API Handler │ ───────────────────────▶ │ archiveChannel│
│ (Job Stop) │ │ (buffer: 128)│
└──────────────┘ └───────┬───────┘
┌─────────────────────────────────┘
┌──────────────────────┐
│ archivingWorker() │
│ (goroutine) │
└──────────┬───────────┘
1. Fetch job metadata
2. Load metric data
3. Calculate statistics
4. Archive to backend
5. Update database
6. Call hooks
```
### Components
- **archiveChannel**: Buffered channel (128 jobs) for async communication
- **archivePending**: WaitGroup tracking in-flight archiving operations
- **archivingWorker**: Background goroutine processing archiving requests
- **shutdownCtx**: Context for graceful cancellation during shutdown
## Usage
### Initialization
```go
// Start archiver with context for shutdown control
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
archiver.Start(jobRepository, ctx)
```
### Archiving a Job
```go
// Called automatically when a job completes
archiver.TriggerArchiving(job)
```
The function returns immediately. Actual archiving happens in the background.
### Graceful Shutdown
```go
// Shutdown with 10 second timeout
if err := archiver.Shutdown(10 * time.Second); err != nil {
log.Printf("Archiver shutdown timeout: %v", err)
}
```
**Shutdown process:**
1. Closes channel (rejects new jobs)
2. Waits for pending jobs (up to timeout)
3. Cancels context if timeout exceeded
4. Waits for worker to exit cleanly
## Configuration
### Channel Buffer Size
The archiving channel has a buffer of 128 jobs. If more than 128 jobs are queued simultaneously, `TriggerArchiving()` will block until space is available.
To adjust:
```go
// In archiveWorker.go Start() function
archiveChannel = make(chan *schema.Job, 256) // Increase buffer
```
### Scope Selection
Archive data scopes are automatically selected based on job size:
- **Node scope**: Always included
- **Core scope**: Included for jobs with ≤8 nodes (reduces data volume for large jobs)
- **Accelerator scope**: Included if job used accelerators (`NumAcc > 0`)
To adjust the node threshold:
```go
// In archiver.go ArchiveJob() function
if job.NumNodes <= 16 { // Change from 8 to 16
scopes = append(scopes, schema.MetricScopeCore)
}
```
### Resolution
Data is archived at the highest available resolution (typically 60s intervals). To change:
```go
// In archiver.go ArchiveJob() function
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
## Error Handling
### Automatic Retry
The archiver does **not** automatically retry failed archiving operations. If archiving fails:
1. Error is logged
2. Job is marked as `MonitoringStatusArchivingFailed` in database
3. Worker continues processing other jobs
### Manual Retry
To re-archive failed jobs, query for jobs with `MonitoringStatusArchivingFailed` and call `TriggerArchiving()` again.
## Performance Considerations
### Single Worker Thread
The archiver uses a single worker goroutine. For high-throughput systems:
- Large channel buffer (128) prevents blocking
- Archiving is typically I/O bound (writing to storage)
- Single worker prevents overwhelming storage backend
### Shutdown Timeout
Recommended timeout values:
- **Development**: 5-10 seconds
- **Production**: 10-30 seconds
- **High-load**: 30-60 seconds
Choose based on:
- Average archiving time per job
- Storage backend latency
- Acceptable shutdown delay
## Monitoring
### Logging
The archiver logs:
- **Info**: Startup, shutdown, successful completions
- **Debug**: Individual job archiving times
- **Error**: Archiving failures with job ID and reason
- **Warn**: Shutdown timeout exceeded
### Metrics
Monitor these signals for archiver health:
- Jobs with `MonitoringStatusArchivingFailed`
- Time from job stop to successful archive
- Shutdown timeout occurrences
## Thread Safety
All exported functions are safe for concurrent use:
- `Start()` - Safe to call once
- `TriggerArchiving()` - Safe from multiple goroutines
- `Shutdown()` - Safe to call once
Internal state is protected by:
- Channel synchronization (`archiveChannel`)
- WaitGroup for pending count (`archivePending`)
- Context for cancellation (`shutdownCtx`)
## Files
- **archiveWorker.go**: Worker lifecycle, channel management, shutdown logic
- **archiver.go**: Core archiving logic, metric loading, statistics calculation
## Dependencies
- `internal/repository`: Database operations for job metadata
- `internal/metricdispatch`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures

View File

@@ -1,17 +1,61 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package archiver provides asynchronous job archiving functionality for ClusterCockpit.
//
// The archiver runs a background worker goroutine that processes job archiving requests
// from a buffered channel. When jobs complete, their metric data is archived from the
// metric store to the configured archive backend (filesystem, S3, etc.).
//
// # Architecture
//
// The archiver uses a producer-consumer pattern:
// - Producer: TriggerArchiving() sends jobs to archiveChannel
// - Consumer: archivingWorker() processes jobs from the channel
// - Coordination: sync.WaitGroup tracks pending archive operations
//
// # Lifecycle
//
// 1. Start(repo, ctx) - Initialize worker with context for cancellation
// 2. TriggerArchiving(job) - Queue job for archiving (called when job stops)
// 3. archivingWorker() - Background goroutine processes jobs
// 4. Shutdown(timeout) - Graceful shutdown with timeout
//
// # Graceful Shutdown
//
// The archiver supports graceful shutdown with configurable timeout:
// - Closes channel to reject new jobs
// - Waits for pending jobs to complete (up to timeout)
// - Cancels context if timeout exceeded
// - Ensures worker goroutine exits cleanly
//
// # Example Usage
//
// // Initialize archiver
// ctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// archiver.Start(jobRepository, ctx)
//
// // Trigger archiving when job completes
// archiver.TriggerArchiving(job)
//
// // Graceful shutdown with 10 second timeout
// if err := archiver.Shutdown(10 * time.Second); err != nil {
// log.Printf("Archiver shutdown timeout: %v", err)
// }
package archiver
import (
"context"
"fmt"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -19,76 +63,188 @@ var (
archivePending sync.WaitGroup
archiveChannel chan *schema.Job
jobRepo *repository.JobRepository
shutdownCtx context.Context
shutdownCancel context.CancelFunc
workerDone chan struct{}
)
func Start(r *repository.JobRepository) {
// Start initializes the archiver and starts the background worker goroutine.
//
// The archiver processes job archiving requests asynchronously via a buffered channel.
// Jobs are sent to the channel using TriggerArchiving() and processed by the worker.
//
// Parameters:
// - r: JobRepository instance for database operations
// - ctx: Context for cancellation (shutdown signal propagation)
//
// The worker goroutine will run until:
// - ctx is cancelled (via parent shutdown)
// - archiveChannel is closed (via Shutdown())
//
// Must be called before TriggerArchiving(). Safe to call only once.
func Start(r *repository.JobRepository, ctx context.Context) {
shutdownCtx, shutdownCancel = context.WithCancel(ctx)
archiveChannel = make(chan *schema.Job, 128)
workerDone = make(chan struct{})
jobRepo = r
go archivingWorker()
}
// Archiving worker thread
// archivingWorker is the background goroutine that processes job archiving requests.
//
// The worker loop:
// 1. Blocks waiting for jobs on archiveChannel or shutdown signal
// 2. Fetches job metadata from repository
// 3. Archives job data to configured backend (calls ArchiveJob)
// 4. Updates job footprint and energy metrics in database
// 5. Marks job as successfully archived
// 6. Calls job stop hooks
//
// The worker exits when:
// - shutdownCtx is cancelled (timeout during shutdown)
// - archiveChannel is closed (normal shutdown)
//
// Errors during archiving are logged and the job is marked as failed,
// but the worker continues processing other jobs.
func archivingWorker() {
defer close(workerDone)
for {
select {
case <-shutdownCtx.Done():
cclog.Info("Archive worker received shutdown signal")
return
case job, ok := <-archiveChannel:
if !ok {
break
cclog.Info("Archive channel closed, worker exiting")
return
}
start := time.Now()
// not using meta data, called to load JobMeta into Cache?
// will fail if job meta not in repository
if _, err := jobRepo.FetchMetadata(job); err != nil {
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
}
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
// TODO: Maybe use context with cancel/timeout here
jobMeta, err := ArchiveJob(job, context.Background())
// Use shutdown context to allow cancellation
jobMeta, err := ArchiveJob(job, shutdownCtx)
if err != nil {
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
}
stmt := sq.Update("job").Where("job.id = ?", job.ID)
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
log.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
log.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
// Update the jobs database entry one last time:
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
if err := jobRepo.Execute(stmt); err != nil {
log.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
log.Printf("archiving job (dbid: %d) successful", job.ID)
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
cclog.Infof("archiving job (dbid: %d) successful", *job.ID)
repository.CallJobStopHooks(job)
archivePending.Done()
}
}
}
// Trigger async archiving
// TriggerArchiving queues a job for asynchronous archiving.
//
// This function should be called when a job completes (stops) to archive its
// metric data from the metric store to the configured archive backend.
//
// The function:
// 1. Increments the pending job counter (WaitGroup)
// 2. Sends the job to the archiving channel (buffered, capacity 128)
// 3. Returns immediately (non-blocking unless channel is full)
//
// The actual archiving is performed asynchronously by the worker goroutine.
// Upon completion, the worker will decrement the pending counter.
//
// Panics if Start() has not been called first.
func TriggerArchiving(job *schema.Job) {
if archiveChannel == nil {
log.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
}
archivePending.Add(1)
archiveChannel <- job
}
// Wait for background thread to finish pending archiving operations
func WaitForArchiving() {
// close channel and wait for worker to process remaining jobs
archivePending.Wait()
// Shutdown performs a graceful shutdown of the archiver with a configurable timeout.
//
// The shutdown process:
// 1. Closes archiveChannel - no new jobs will be accepted
// 2. Waits for pending jobs to complete (up to timeout duration)
// 3. If timeout is exceeded:
// - Cancels shutdownCtx to interrupt ongoing ArchiveJob operations
// - Returns error indicating timeout
// 4. Waits for worker goroutine to exit cleanly
//
// Parameters:
// - timeout: Maximum duration to wait for pending jobs to complete
// (recommended: 10-30 seconds for production)
//
// Returns:
// - nil if all jobs completed within timeout
// - error if timeout was exceeded (some jobs may not have been archived)
//
// Jobs that don't complete within the timeout will be marked as failed.
// The function always ensures the worker goroutine exits before returning.
//
// Example:
//
// if err := archiver.Shutdown(10 * time.Second); err != nil {
// log.Printf("Some jobs did not complete: %v", err)
// }
func Shutdown(timeout time.Duration) error {
cclog.Info("Initiating archiver shutdown...")
// Close channel to signal no more jobs will be accepted
close(archiveChannel)
// Create a channel to signal when all jobs are done
done := make(chan struct{})
go func() {
archivePending.Wait()
close(done)
}()
// Wait for jobs to complete or timeout
select {
case <-done:
cclog.Info("All archive jobs completed successfully")
// Wait for worker to exit
<-workerDone
return nil
case <-time.After(timeout):
cclog.Warn("Archiver shutdown timeout exceeded, cancelling remaining operations")
// Cancel any ongoing operations
shutdownCancel()
// Wait for worker to exit
<-workerDone
return fmt.Errorf("archiver shutdown timeout after %v", timeout)
}
}

View File

@@ -1,22 +1,47 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archiver
import (
"context"
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
// ArchiveJob archives a completed job's metric data to the configured archive backend.
//
// This function performs the following operations:
// 1. Loads all metric data for the job from the metric data repository
// 2. Calculates job-level statistics (avg, min, max) for each metric
// 3. Stores the job metadata and metric data to the archive backend
//
// Metric data is retrieved at the highest available resolution (typically 60s)
// for the following scopes:
// - Node scope (always)
// - Core scope (for jobs with ≤8 nodes, to reduce data volume)
// - Accelerator scope (if job used accelerators)
//
// The function respects context cancellation. If ctx is cancelled (e.g., during
// shutdown timeout), the operation will be interrupted and return an error.
//
// Parameters:
// - job: The job to archive (must be a completed job)
// - ctx: Context for cancellation and timeout control
//
// Returns:
// - *schema.Job with populated Statistics field
// - error if data loading or archiving fails
//
// If config.Keys.DisableArchive is true, only job statistics are calculated
// and returned (no data is written to archive backend).
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
@@ -34,17 +59,13 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
if err != nil {
log.Error("Error wile loading job data for archiving")
cclog.Error("Error wile loading job data for archiving")
return nil, err
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
job.Statistics = make(map[string]schema.JobStatistics)
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
@@ -61,7 +82,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
}
// Round AVG Result to 2 Digits
jobMeta.Statistics[metric] = schema.JobStatistics{
job.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
@@ -72,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if config.Keys.DisableArchive {
return jobMeta, nil
}
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
return job, archive.GetHandle().ImportJob(job, &jobData)
}

View File

@@ -1,15 +1,20 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package auth implements various authentication methods
package auth
import (
"bytes"
"context"
"crypto/rand"
"database/sql"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"os"
@@ -20,13 +25,25 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/sessions"
)
// Authenticator is the interface for all authentication methods.
// Each authenticator determines if it can handle a login request (CanLogin)
// and performs the actual authentication (Login).
type Authenticator interface {
// CanLogin determines if this authenticator can handle the login request.
// It returns the user object if available and a boolean indicating if this
// authenticator should attempt the login. This method should not perform
// expensive operations or actual authentication.
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
// Login performs the actually authentication for the user.
// It returns the authenticated user or an error if authentication fails.
// The user parameter may be nil if the user doesn't exist in the database yet.
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
}
@@ -35,19 +52,70 @@ var (
authInstance *Authentication
)
var ipUserLimiters sync.Map
func getIPUserLimiter(ip, username string) *rate.Limiter {
key := ip + ":" + username
limiter, ok := ipUserLimiters.Load(key)
if !ok {
newLimiter := rate.NewLimiter(rate.Every(time.Hour/10), 10)
ipUserLimiters.Store(key, newLimiter)
return newLimiter
}
return limiter.(*rate.Limiter)
// rateLimiterEntry tracks a rate limiter and its last use time for cleanup
type rateLimiterEntry struct {
limiter *rate.Limiter
lastUsed time.Time
}
var ipUserLimiters sync.Map
// getIPUserLimiter returns a rate limiter for the given IP and username combination.
// Rate limiters are created on demand and track 5 attempts per 15 minutes.
func getIPUserLimiter(ip, username string) *rate.Limiter {
key := ip + ":" + username
now := time.Now()
if entry, ok := ipUserLimiters.Load(key); ok {
rle := entry.(*rateLimiterEntry)
rle.lastUsed = now
return rle.limiter
}
// More aggressive rate limiting: 5 attempts per 15 minutes
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
ipUserLimiters.Store(key, &rateLimiterEntry{
limiter: newLimiter,
lastUsed: now,
})
return newLimiter
}
// cleanupOldRateLimiters removes rate limiters that haven't been used recently
func cleanupOldRateLimiters(olderThan time.Time) {
ipUserLimiters.Range(func(key, value any) bool {
entry := value.(*rateLimiterEntry)
if entry.lastUsed.Before(olderThan) {
ipUserLimiters.Delete(key)
cclog.Debugf("Cleaned up rate limiter for %v", key)
}
return true
})
}
// startRateLimiterCleanup starts a background goroutine to clean up old rate limiters
func startRateLimiterCleanup() {
go func() {
ticker := time.NewTicker(1 * time.Hour)
defer ticker.Stop()
for range ticker.C {
// Clean up limiters not used in the last 24 hours
cleanupOldRateLimiters(time.Now().Add(-24 * time.Hour))
}
}()
}
// AuthConfig contains configuration for all authentication methods
type AuthConfig struct {
LdapConfig *LdapConfig `json:"ldap"`
JwtConfig *JWTAuthConfig `json:"jwts"`
OpenIDConfig *OpenIDConfig `json:"oidc"`
}
// Keys holds the global authentication configuration
var Keys AuthConfig
// Authentication manages all authentication methods and session handling
type Authentication struct {
sessionStore *sessions.CookieStore
LdapAuth *LdapAuthenticator
@@ -63,7 +131,7 @@ func (auth *Authentication) AuthViaSession(
) (*schema.User, error) {
session, err := auth.sessionStore.Get(r, "session")
if err != nil {
log.Error("Error while getting session store")
cclog.Error("Error while getting session store")
return nil, err
}
@@ -71,10 +139,31 @@ func (auth *Authentication) AuthViaSession(
return nil, nil
}
// TODO: Check if session keys exist
username, _ := session.Values["username"].(string)
projects, _ := session.Values["projects"].([]string)
roles, _ := session.Values["roles"].([]string)
// Validate session data with proper type checking
username, ok := session.Values["username"].(string)
if !ok || username == "" {
cclog.Warn("Invalid session: missing or invalid username")
// Invalidate the corrupted session
session.Options.MaxAge = -1
_ = auth.sessionStore.Save(r, rw, session)
return nil, errors.New("invalid session data")
}
projects, ok := session.Values["projects"].([]string)
if !ok {
cclog.Warn("Invalid session: projects not found or invalid type, using empty list")
projects = []string{}
}
roles, ok := session.Values["roles"].([]string)
if !ok || len(roles) == 0 {
cclog.Warn("Invalid session: missing or invalid roles")
// Invalidate the corrupted session
session.Options.MaxAge = -1
_ = auth.sessionStore.Save(r, rw, session)
return nil, errors.New("invalid session data")
}
return &schema.User{
Username: username,
Projects: projects,
@@ -84,22 +173,25 @@ func (auth *Authentication) AuthViaSession(
}, nil
}
func Init() {
func Init(authCfg *json.RawMessage) {
initOnce.Do(func() {
authInstance = &Authentication{}
// Start background cleanup of rate limiters
startRateLimiterCleanup()
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
cclog.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
log.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
cclog.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
} else {
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
log.Fatal("Error while initializing authentication -> decoding session key failed")
cclog.Fatal("Error while initializing authentication -> decoding session key failed")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
}
@@ -108,44 +200,55 @@ func Init() {
authInstance.SessionMaxAge = d
}
if config.Keys.LdapConfig != nil {
if authCfg == nil {
return
}
config.Validate(configSchema, *authCfg)
dec := json.NewDecoder(bytes.NewReader(*authCfg))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Errorf("error while decoding ldap config: %v", err)
}
if Keys.LdapConfig != nil {
ldapAuth := &LdapAuthenticator{}
if err := ldapAuth.Init(); err != nil {
log.Warn("Error while initializing authentication -> ldapAuth init failed")
cclog.Warn("Error while initializing authentication -> ldapAuth init failed")
} else {
authInstance.LdapAuth = ldapAuth
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
}
} else {
log.Info("Missing LDAP configuration: No LDAP support!")
cclog.Info("Missing LDAP configuration: No LDAP support!")
}
if config.Keys.JwtConfig != nil {
if Keys.JwtConfig != nil {
authInstance.JwtAuth = &JWTAuthenticator{}
if err := authInstance.JwtAuth.Init(); err != nil {
log.Fatal("Error while initializing authentication -> jwtAuth init failed")
cclog.Fatal("Error while initializing authentication -> jwtAuth init failed")
}
jwtSessionAuth := &JWTSessionAuthenticator{}
if err := jwtSessionAuth.Init(); err != nil {
log.Info("jwtSessionAuth init failed: No JWT login support!")
cclog.Info("jwtSessionAuth init failed: No JWT login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
}
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
if err := jwtCookieSessionAuth.Init(); err != nil {
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
cclog.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
}
} else {
log.Info("Missing JWT configuration: No JWT token support!")
cclog.Info("Missing JWT configuration: No JWT token support!")
}
authInstance.LocalAuth = &LocalAuthenticator{}
if err := authInstance.LocalAuth.Init(); err != nil {
log.Fatal("Error while initializing authentication -> localAuth init failed")
cclog.Fatal("Error while initializing authentication -> localAuth init failed")
}
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
})
@@ -153,50 +256,53 @@ func Init() {
func GetAuthInstance() *Authentication {
if authInstance == nil {
log.Fatal("Authentication module not initialized!")
cclog.Fatal("Authentication module not initialized!")
}
return authInstance
}
func handleTokenUser(tokenUser *schema.User) {
// handleUserSync syncs or updates a user in the database based on configuration.
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
r := repository.GetUserRepository()
dbUser, err := r.GetUser(tokenUser.Username)
dbUser, err := r.GetUser(user.Username)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%s': %v", tokenUser.Username, err)
} else if err == sql.ErrNoRows && config.Keys.JwtConfig.SyncUserOnLogin { // Adds New User
if err := r.AddUser(tokenUser); err != nil {
log.Errorf("Error while adding user '%s' to DB: %v", tokenUser.Username, err)
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
return
}
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
if err := r.AddUser(user); err != nil {
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
}
} else if err == nil && config.Keys.JwtConfig.UpdateUserOnLogin { // Update Existing User
if err := r.UpdateUser(dbUser, tokenUser); err != nil {
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
} else if err == nil && updateUserOnLogin { // Update existing user
if err := r.UpdateUser(dbUser, user); err != nil {
cclog.Errorf("Error while updating user '%s' in DB: %v", dbUser.Username, err)
}
}
}
func handleOIDCUser(OIDCUser *schema.User) {
r := repository.GetUserRepository()
dbUser, err := r.GetUser(OIDCUser.Username)
// handleTokenUser syncs JWT token user with database
func handleTokenUser(tokenUser *schema.User) {
handleUserSync(tokenUser, Keys.JwtConfig.SyncUserOnLogin, Keys.JwtConfig.UpdateUserOnLogin)
}
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%s': %v", OIDCUser.Username, err)
} else if err == sql.ErrNoRows && config.Keys.OpenIDConfig.SyncUserOnLogin { // Adds New User
if err := r.AddUser(OIDCUser); err != nil {
log.Errorf("Error while adding user '%s' to DB: %v", OIDCUser.Username, err)
}
} else if err == nil && config.Keys.OpenIDConfig.UpdateUserOnLogin { // Update Existing User
if err := r.UpdateUser(dbUser, OIDCUser); err != nil {
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
}
}
// handleOIDCUser syncs OIDC user with database
func handleOIDCUser(OIDCUser *schema.User) {
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
}
// handleLdapUser syncs LDAP user with database
func handleLdapUser(ldapUser *schema.User) {
handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
}
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
session, err := auth.sessionStore.New(r, "session")
if err != nil {
log.Errorf("session creation failed: %s", err.Error())
cclog.Errorf("session creation failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return err
}
@@ -204,7 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
if config.Keys.HttpsCertFile == "" && config.Keys.HttpsKeyFile == "" {
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
// If neither TLS or an encrypted reverse proxy are used, do not mark cookies as secure.
cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
if r.Header.Get("X-Forwarded-Proto") == "" {
// This warning will not be printed if e.g. X-Forwarded-Proto == http
cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
}
session.Options.Secure = false
}
session.Options.SameSite = http.SameSiteStrictMode
@@ -212,7 +324,7 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
session.Values["projects"] = user.Projects
session.Values["roles"] = user.Roles
if err := auth.sessionStore.Save(r, rw, session); err != nil {
log.Warnf("session save failed: %s", err.Error())
cclog.Warnf("session save failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return err
}
@@ -233,9 +345,9 @@ func (auth *Authentication) Login(
limiter := getIPUserLimiter(ip, username)
if !limiter.Allow() {
log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes."))
return
cclog.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
return
}
var dbUser *schema.User
@@ -243,7 +355,7 @@ func (auth *Authentication) Login(
var err error
dbUser, err = repository.GetUserRepository().GetUser(username)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%v'", username)
cclog.Errorf("Error while loading user '%v'", username)
}
}
@@ -253,12 +365,12 @@ func (auth *Authentication) Login(
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
continue
} else {
log.Debugf("Can login with user %v", user)
cclog.Debugf("Can login with user %v", user)
}
user, err := authenticator.Login(user, rw, r)
if err != nil {
log.Warnf("user login failed: %s", err.Error())
cclog.Warnf("user login failed: %s", err.Error())
onfailure(rw, r, err)
return
}
@@ -267,7 +379,7 @@ func (auth *Authentication) Login(
return
}
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
if r.FormValue("redirect") != "" {
@@ -279,7 +391,7 @@ func (auth *Authentication) Login(
return
}
log.Debugf("login failed: no authenticator applied")
cclog.Debugf("login failed: no authenticator applied")
onfailure(rw, r, errors.New("no authenticator applied"))
})
}
@@ -291,14 +403,14 @@ func (auth *Authentication) Auth(
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("auth -> authentication failed: %s", err.Error())
cclog.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
if user == nil {
user, err = auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("auth -> authentication failed: %s", err.Error())
cclog.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
@@ -309,89 +421,134 @@ func (auth *Authentication) Auth(
return
}
log.Info("auth -> authentication failed")
cclog.Info("auth -> authentication failed")
onfailure(rw, r, errors.New("unauthorized (please login first)"))
})
}
func (auth *Authentication) AuthApi(
func (auth *Authentication) AuthAPI(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("auth api -> authentication failed: %s", err.Error())
cclog.Infof("auth api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
ipErr := securedCheck(user, r)
if ipErr != nil {
cclog.Infof("auth api -> secured check failed: %s", ipErr.Error())
onfailure(rw, r, ipErr)
return
}
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
default:
log.Info("auth api -> authentication failed: missing role")
cclog.Info("auth api -> authentication failed: missing role")
onfailure(rw, r, errors.New("unauthorized"))
}
}
log.Info("auth api -> authentication failed: no auth")
cclog.Info("auth api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthUserApi(
func (auth *Authentication) AuthUserAPI(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("auth user api -> authentication failed: %s", err.Error())
cclog.Infof("auth user api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
default:
log.Info("auth user api -> authentication failed: missing role")
cclog.Info("auth user api -> authentication failed: missing role")
onfailure(rw, r, errors.New("unauthorized"))
}
}
log.Info("auth user api -> authentication failed: no auth")
cclog.Info("auth user api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthConfigApi(
func (auth *Authentication) AuthMetricStoreAPI(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
default:
cclog.Info("auth metricstore api -> authentication failed: missing role")
onfailure(rw, r, errors.New("unauthorized"))
}
}
cclog.Info("auth metricstore api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthConfigAPI(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("auth config api -> authentication failed: %s", err.Error())
cclog.Infof("auth config api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
@@ -400,19 +557,19 @@ func (auth *Authentication) AuthConfigApi(
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Info("auth config api -> authentication failed: no auth")
cclog.Info("auth config api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthFrontendApi(
func (auth *Authentication) AuthFrontendAPI(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("auth frontend api -> authentication failed: %s", err.Error())
cclog.Infof("auth frontend api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
@@ -421,7 +578,7 @@ func (auth *Authentication) AuthFrontendApi(
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Info("auth frontend api -> authentication failed: no auth")
cclog.Info("auth frontend api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
@@ -445,3 +602,42 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
onsuccess.ServeHTTP(rw, r)
})
}
// Helper Moved To MiddleWare Auth Handlers
func securedCheck(user *schema.User, r *http.Request) error {
if user == nil {
return fmt.Errorf("no user for secured check")
}
// extract IP address for checking
IPAddress := r.Header.Get("X-Real-Ip")
if IPAddress == "" {
IPAddress = r.Header.Get("X-Forwarded-For")
}
if IPAddress == "" {
IPAddress = r.RemoteAddr
}
// Handle both IPv4 and IPv6 addresses properly
// For IPv6, this will strip the port and brackets
// For IPv4, this will strip the port
if host, _, err := net.SplitHostPort(IPAddress); err == nil {
IPAddress = host
}
// If SplitHostPort fails, IPAddress is already just a host (no port)
// If nothing declared in config: Continue
if len(config.Keys.APIAllowedIPs) == 0 {
return nil
}
// If wildcard declared in config: Continue
if config.Keys.APIAllowedIPs[0] == "*" {
return nil
}
// check if IP is allowed
if !util.Contains(config.Keys.APIAllowedIPs, IPAddress) {
return fmt.Errorf("unknown ip: %v", IPAddress)
}
return nil
}

176
internal/auth/auth_test.go Normal file
View File

@@ -0,0 +1,176 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"net"
"testing"
"time"
)
// TestGetIPUserLimiter tests the rate limiter creation and retrieval
func TestGetIPUserLimiter(t *testing.T) {
ip := "192.168.1.1"
username := "testuser"
// Get limiter for the first time
limiter1 := getIPUserLimiter(ip, username)
if limiter1 == nil {
t.Fatal("Expected limiter to be created")
}
// Get the same limiter again
limiter2 := getIPUserLimiter(ip, username)
if limiter1 != limiter2 {
t.Error("Expected to get the same limiter instance")
}
// Get a different limiter for different user
limiter3 := getIPUserLimiter(ip, "otheruser")
if limiter1 == limiter3 {
t.Error("Expected different limiter for different user")
}
// Get a different limiter for different IP
limiter4 := getIPUserLimiter("192.168.1.2", username)
if limiter1 == limiter4 {
t.Error("Expected different limiter for different IP")
}
}
// TestRateLimiterBehavior tests that rate limiting works correctly
func TestRateLimiterBehavior(t *testing.T) {
ip := "10.0.0.1"
username := "ratelimituser"
limiter := getIPUserLimiter(ip, username)
// Should allow first 5 attempts
for i := range 5 {
if !limiter.Allow() {
t.Errorf("Request %d should be allowed within rate limit", i+1)
}
}
// 6th attempt should be blocked
if limiter.Allow() {
t.Error("Request 6 should be blocked by rate limiter")
}
}
// TestCleanupOldRateLimiters tests the cleanup function
func TestCleanupOldRateLimiters(t *testing.T) {
// Clear all existing limiters first to avoid interference from other tests
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
// Create some new rate limiters
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
if limiter1 == nil || limiter2 == nil {
t.Fatal("Failed to create test limiters")
}
// Cleanup limiters older than 1 second from now (should keep both)
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
// Verify they still exist (should get same instance)
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
t.Error("Limiter 1 was incorrectly cleaned up")
}
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
t.Error("Limiter 2 was incorrectly cleaned up")
}
// Cleanup limiters older than 1 hour from now (should remove both)
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
// Getting them again should create new instances
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
if newLimiter1 == limiter1 {
t.Error("Old limiter should have been cleaned up")
}
}
// TestIPv4Extraction tests extracting IPv4 addresses
func TestIPv4Extraction(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{"IPv4 with port", "192.168.1.1:8080", "192.168.1.1"},
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
})
}
}
// TestIPv6Extraction tests extracting IPv6 addresses
func TestIPv6Extraction(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{"IPv6 with port", "[2001:db8::1]:8080", "2001:db8::1"},
{"IPv6 localhost with port", "[::1]:3000", "::1"},
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
{"IPv6 localhost", "::1", "::1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
})
}
}
// TestIPExtractionEdgeCases tests edge cases for IP extraction
func TestIPExtractionEdgeCases(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{"Hostname without port", "example.com", "example.com"},
{"Empty string", "", ""},
{"Just port", ":8080", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
})
}
}

View File

@@ -1,7 +1,8 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
@@ -13,13 +14,33 @@ import (
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
type JWTAuthConfig struct {
// Specifies for how long a JWT token shall be valid
// as a string parsable by time.ParseDuration().
MaxAge string `json:"max-age"`
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
CookieName string `json:"cookie-name"`
// Deny login for users not in database (but defined in JWT).
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
ValidateUser bool `json:"validate-user"`
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
TrustedIssuer string `json:"trusted-issuer"`
// Should an non-existent user be added to the DB based on the information in the token
SyncUserOnLogin bool `json:"sync-user-on-login"`
// Should an existent user be updated in the DB based on the information in the token
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type JWTAuthenticator struct {
publicKey ed25519.PublicKey
privateKey ed25519.PrivateKey
@@ -28,17 +49,17 @@ type JWTAuthenticator struct {
func (ja *JWTAuthenticator) Init() error {
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode JWT public key")
cclog.Warn("Could not decode JWT public key")
return err
}
ja.publicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
log.Warn("Could not decode JWT private key")
cclog.Warn("Could not decode JWT private key")
return err
}
ja.privateKey = ed25519.PrivateKey(bytes)
@@ -62,7 +83,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
return nil, nil
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
@@ -70,51 +91,34 @@ func (ja *JWTAuthenticator) AuthViaJWT(
return ja.publicKey, nil
})
if err != nil {
log.Warn("Error while parsing JWT token")
cclog.Warn("Error while parsing JWT token")
return nil, err
}
if !token.Valid {
log.Warn("jwt token claims are not valid")
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
// Token is valid, extract payload
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var roles []string
// Validate user + roles from JWT against database?
if config.Keys.JwtConfig.ValidateUser {
ur := repository.GetUserRepository()
user, err := ur.GetUser(sub)
// Deny any logins for unknown usernames
if err != nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Take user roles from database instead of trusting the JWT
roles = user.Roles
} else {
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
roles = append(roles, r)
}
}
}
// Use shared helper to get user from JWT claims
var user *schema.User
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
if err != nil {
return nil, err
}
return &schema.User{
Username: sub,
Roles: roles,
AuthType: schema.AuthToken,
AuthSource: -1,
}, nil
// If not validating user, we only get roles from JWT (no projects for this auth method)
if !Keys.JwtConfig.ValidateUser {
user.Roles = extractRolesFromClaims(claims, false)
user.Projects = nil // Standard JWT auth doesn't include projects
}
return user, nil
}
// Generate a new JWT that can be used for authentication
// ProvideJWT generates a new JWT that can be used for authentication
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
if ja.privateKey == nil {
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
@@ -126,8 +130,8 @@ func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
"roles": user.Roles,
"iat": now.Unix(),
}
if config.Keys.JwtConfig.MaxAge != "" {
d, err := time.ParseDuration(config.Keys.JwtConfig.MaxAge)
if Keys.JwtConfig.MaxAge != "" {
d, err := time.ParseDuration(Keys.JwtConfig.MaxAge)
if err != nil {
return "", errors.New("cannot parse max-age config key")
}

View File

@@ -1,22 +1,19 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"crypto/ed25519"
"database/sql"
"encoding/base64"
"errors"
"fmt"
"net/http"
"os"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -31,18 +28,18 @@ var _ Authenticator = (*JWTCookieSessionAuthenticator)(nil)
func (ja *JWTCookieSessionAuthenticator) Init() error {
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode JWT public key")
cclog.Warn("Could not decode JWT public key")
return err
}
ja.publicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
log.Warn("Could not decode JWT private key")
cclog.Warn("Could not decode JWT private key")
return err
}
ja.privateKey = ed25519.PrivateKey(bytes)
@@ -53,36 +50,35 @@ func (ja *JWTCookieSessionAuthenticator) Init() error {
if keyFound && pubKeyCrossLogin != "" {
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
if err != nil {
log.Warn("Could not decode cross login JWT public key")
cclog.Warn("Could not decode cross login JWT public key")
return err
}
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
} else {
ja.publicKeyCrossLogin = nil
log.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
cclog.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
}
jc := config.Keys.JwtConfig
// Warn if other necessary settings are not configured
if jc != nil {
if jc.CookieName == "" {
log.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
if Keys.JwtConfig != nil {
if Keys.JwtConfig.CookieName == "" {
cclog.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
}
if !jc.ValidateUser {
log.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
if !Keys.JwtConfig.ValidateUser {
cclog.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
}
if jc.TrustedIssuer == "" {
log.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
if Keys.JwtConfig.TrustedIssuer == "" {
cclog.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
}
} else {
log.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
cclog.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
}
log.Info("JWT Cookie Session authenticator successfully registered")
cclog.Info("JWT Cookie Session authenticator successfully registered")
return nil
}
@@ -92,7 +88,7 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
rw http.ResponseWriter,
r *http.Request,
) (*schema.User, bool) {
jc := config.Keys.JwtConfig
jc := Keys.JwtConfig
cookieName := ""
if jc.CookieName != "" {
cookieName = jc.CookieName
@@ -115,7 +111,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
rw http.ResponseWriter,
r *http.Request,
) (*schema.User, error) {
jc := config.Keys.JwtConfig
jc := Keys.JwtConfig
jwtCookie, err := r.Cookie(jc.CookieName)
var rawtoken string
@@ -123,7 +119,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
rawtoken = jwtCookie.Value
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
@@ -140,67 +136,26 @@ func (ja *JWTCookieSessionAuthenticator) Login(
return ja.publicKey, nil
})
if err != nil {
log.Warn("JWT cookie session: error while parsing token")
cclog.Warn("JWT cookie session: error while parsing token")
return nil, err
}
if !token.Valid {
log.Warn("jwt token claims are not valid")
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var roles []string
projects := make([]string, 0)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
if jc.ValidateUser {
var err error
user, err = repository.GetUserRepository().GetUser(sub)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%v'", sub)
}
// Deny any logins for unknown usernames
if user == nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
} else {
var name string
if wrap, ok := claims["name"].(map[string]interface{}); ok {
if vals, ok := wrap["values"].([]interface{}); ok {
if len(vals) != 0 {
name = fmt.Sprintf("%v", vals[0])
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
}
}
}
}
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
roles = append(roles, r)
}
}
}
user = &schema.User{
Username: sub,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaToken,
}
if jc.SyncUserOnLogin || jc.UpdateUserOnLogin {
handleTokenUser(user)
}
// Sync or update user if configured
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
handleTokenUser(user)
}
// (Ask browser to) Delete JWT cookie

138
internal/auth/jwtHelpers.go Normal file
View File

@@ -0,0 +1,138 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"database/sql"
"errors"
"fmt"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
// extractStringFromClaims extracts a string value from JWT claims
func extractStringFromClaims(claims jwt.MapClaims, key string) string {
if val, ok := claims[key].(string); ok {
return val
}
return ""
}
// extractRolesFromClaims extracts roles from JWT claims
// If validateRoles is true, only valid roles are returned
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
var roles []string
if rawroles, ok := claims["roles"].([]any); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
if validateRoles {
if schema.IsValidRole(r) {
roles = append(roles, r)
}
} else {
roles = append(roles, r)
}
}
}
}
return roles
}
// extractProjectsFromClaims extracts projects from JWT claims
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
projects := make([]string, 0)
if rawprojs, ok := claims["projects"].([]any); ok {
for _, pp := range rawprojs {
if p, ok := pp.(string); ok {
projects = append(projects, p)
}
}
} else if rawprojs, ok := claims["projects"]; ok {
if projSlice, ok := rawprojs.([]string); ok {
projects = append(projects, projSlice...)
}
}
return projects
}
// extractNameFromClaims extracts name from JWT claims
// Handles both simple string and complex nested structure
func extractNameFromClaims(claims jwt.MapClaims) string {
// Try simple string first
if name, ok := claims["name"].(string); ok {
return name
}
// Try nested structure: {name: {values: [...]}}
if wrap, ok := claims["name"].(map[string]any); ok {
if vals, ok := wrap["values"].([]any); ok {
if len(vals) == 0 {
return ""
}
var name strings.Builder
name.WriteString(fmt.Sprintf("%v", vals[0]))
for i := 1; i < len(vals); i++ {
name.WriteString(fmt.Sprintf(" %v", vals[i]))
}
return name.String()
}
}
return ""
}
// getUserFromJWT creates or retrieves a user based on JWT claims
// If validateUser is true, the user must exist in the database
// Otherwise, a new user object is created from claims
// authSource should be a schema.AuthSource constant (like schema.AuthViaToken)
func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.AuthType, authSource schema.AuthSource) (*schema.User, error) {
sub := extractStringFromClaims(claims, "sub")
if sub == "" {
return nil, errors.New("missing 'sub' claim in JWT")
}
if validateUser {
// Validate user against database
ur := repository.GetUserRepository()
user, err := ur.GetUser(sub)
if err != nil && err != sql.ErrNoRows {
cclog.Errorf("Error while loading user '%v': %v", sub, err)
return nil, fmt.Errorf("database error: %w", err)
}
// Deny any logins for unknown usernames
if user == nil || err == sql.ErrNoRows {
cclog.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Return database user (with database roles)
return user, nil
}
// Create user from JWT claims
name := extractNameFromClaims(claims)
roles := extractRolesFromClaims(claims, true) // Validate roles
projects := extractProjectsFromClaims(claims)
return &schema.User{
Username: sub,
Name: name,
Roles: roles,
Projects: projects,
AuthType: authType,
AuthSource: authSource,
}, nil
}

View File

@@ -0,0 +1,280 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
// TestExtractStringFromClaims tests extracting string values from JWT claims
func TestExtractStringFromClaims(t *testing.T) {
claims := jwt.MapClaims{
"sub": "testuser",
"email": "test@example.com",
"age": 25, // not a string
}
tests := []struct {
name string
key string
expected string
}{
{"Existing string", "sub", "testuser"},
{"Another string", "email", "test@example.com"},
{"Non-existent key", "missing", ""},
{"Non-string value", "age", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractStringFromClaims(claims, tt.key)
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
})
}
}
// TestExtractRolesFromClaims tests role extraction and validation
func TestExtractRolesFromClaims(t *testing.T) {
tests := []struct {
name string
claims jwt.MapClaims
validateRoles bool
expected []string
}{
{
name: "Valid roles without validation",
claims: jwt.MapClaims{
"roles": []any{"admin", "user", "invalid_role"},
},
validateRoles: false,
expected: []string{"admin", "user", "invalid_role"},
},
{
name: "Valid roles with validation",
claims: jwt.MapClaims{
"roles": []any{"admin", "user", "api"},
},
validateRoles: true,
expected: []string{"admin", "user", "api"},
},
{
name: "Invalid roles with validation",
claims: jwt.MapClaims{
"roles": []any{"invalid_role", "fake_role"},
},
validateRoles: true,
expected: []string{}, // Should filter out invalid roles
},
{
name: "No roles claim",
claims: jwt.MapClaims{},
validateRoles: false,
expected: []string{},
},
{
name: "Non-array roles",
claims: jwt.MapClaims{
"roles": "admin",
},
validateRoles: false,
expected: []string{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
return
}
for i, role := range result {
if i >= len(tt.expected) || role != tt.expected[i] {
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
}
}
})
}
}
// TestExtractProjectsFromClaims tests project extraction from claims
func TestExtractProjectsFromClaims(t *testing.T) {
tests := []struct {
name string
claims jwt.MapClaims
expected []string
}{
{
name: "Projects as array of interfaces",
claims: jwt.MapClaims{
"projects": []any{"project1", "project2", "project3"},
},
expected: []string{"project1", "project2", "project3"},
},
{
name: "Projects as string array",
claims: jwt.MapClaims{
"projects": []string{"projectA", "projectB"},
},
expected: []string{"projectA", "projectB"},
},
{
name: "No projects claim",
claims: jwt.MapClaims{},
expected: []string{},
},
{
name: "Mixed types in projects array",
claims: jwt.MapClaims{
"projects": []any{"project1", 123, "project2"},
},
expected: []string{"project1", "project2"}, // Should skip non-strings
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractProjectsFromClaims(tt.claims)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
return
}
for i, project := range result {
if i >= len(tt.expected) || project != tt.expected[i] {
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
}
}
})
}
}
// TestExtractNameFromClaims tests name extraction from various formats
func TestExtractNameFromClaims(t *testing.T) {
tests := []struct {
name string
claims jwt.MapClaims
expected string
}{
{
name: "Simple string name",
claims: jwt.MapClaims{
"name": "John Doe",
},
expected: "John Doe",
},
{
name: "Nested name structure",
claims: jwt.MapClaims{
"name": map[string]any{
"values": []any{"John", "Doe"},
},
},
expected: "John Doe",
},
{
name: "Nested name with single value",
claims: jwt.MapClaims{
"name": map[string]any{
"values": []any{"Alice"},
},
},
expected: "Alice",
},
{
name: "No name claim",
claims: jwt.MapClaims{},
expected: "",
},
{
name: "Empty nested values",
claims: jwt.MapClaims{
"name": map[string]any{
"values": []any{},
},
},
expected: "",
},
{
name: "Nested with non-string values",
claims: jwt.MapClaims{
"name": map[string]any{
"values": []any{123, "Smith"},
},
},
expected: "123 Smith", // Should convert to string
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractNameFromClaims(tt.claims)
if result != tt.expected {
t.Errorf("Expected '%s', got '%s'", tt.expected, result)
}
})
}
}
// TestGetUserFromJWT_NoValidation tests getUserFromJWT without database validation
func TestGetUserFromJWT_NoValidation(t *testing.T) {
claims := jwt.MapClaims{
"sub": "testuser",
"name": "Test User",
"roles": []any{"user", "admin"},
"projects": []any{"project1", "project2"},
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
if user.Username != "testuser" {
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
}
if user.Name != "Test User" {
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
}
if len(user.Roles) != 2 {
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
}
if len(user.Projects) != 2 {
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
}
if user.AuthType != schema.AuthToken {
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
}
}
// TestGetUserFromJWT_MissingSub tests error when sub claim is missing
func TestGetUserFromJWT_MissingSub(t *testing.T) {
claims := jwt.MapClaims{
"name": "Test User",
}
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err == nil {
t.Error("Expected error for missing sub claim")
}
if err.Error() != "missing 'sub' claim in JWT" {
t.Errorf("Expected specific error message, got: %v", err)
}
}

View File

@@ -1,11 +1,11 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"database/sql"
"encoding/base64"
"errors"
"fmt"
@@ -13,10 +13,8 @@ import (
"os"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -30,13 +28,13 @@ func (ja *JWTSessionAuthenticator) Init() error {
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
log.Warn("Could not decode cross login JWT HS512 key")
cclog.Warn("Could not decode cross login JWT HS512 key")
return err
}
ja.loginTokenKey = bytes
}
log.Info("JWT Session authenticator successfully registered")
cclog.Info("JWT Session authenticator successfully registered")
return nil
}
@@ -60,87 +58,33 @@ func (ja *JWTSessionAuthenticator) Login(
rawtoken = r.URL.Query().Get("login-token")
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
return ja.loginTokenKey, nil
}
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
})
if err != nil {
log.Warn("Error while parsing jwt token")
cclog.Warn("Error while parsing jwt token")
return nil, err
}
if !token.Valid {
log.Warn("jwt token claims are not valid")
cclog.Warn("jwt token claims are not valid")
return nil, errors.New("jwt token claims are not valid")
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var roles []string
projects := make([]string, 0)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
if config.Keys.JwtConfig.ValidateUser {
var err error
user, err = repository.GetUserRepository().GetUser(sub)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%v'", sub)
}
// Deny any logins for unknown usernames
if user == nil {
log.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
} else {
var name string
if wrap, ok := claims["name"].(map[string]interface{}); ok {
if vals, ok := wrap["values"].([]interface{}); ok {
if len(vals) != 0 {
name = fmt.Sprintf("%v", vals[0])
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
}
}
}
}
// Extract roles from JWT (if present)
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
if schema.IsValidRole(r) {
roles = append(roles, r)
}
}
}
}
if rawprojs, ok := claims["projects"].([]interface{}); ok {
for _, pp := range rawprojs {
if p, ok := pp.(string); ok {
projects = append(projects, p)
}
}
} else if rawprojs, ok := claims["projects"]; ok {
projects = append(projects, rawprojs.([]string)...)
}
user = &schema.User{
Username: sub,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaToken,
}
if config.Keys.JwtConfig.SyncUserOnLogin || config.Keys.JwtConfig.UpdateUserOnLogin {
handleTokenUser(user)
}
// Sync or update user if configured
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
handleTokenUser(user)
}
return user, nil

View File

@@ -1,26 +1,44 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"errors"
"fmt"
"net"
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-ldap/ldap/v3"
)
type LdapConfig struct {
URL string `json:"url"`
UserBase string `json:"user-base"`
SearchDN string `json:"search-dn"`
UserBind string `json:"user-bind"`
UserFilter string `json:"user-filter"`
UserAttr string `json:"username-attr"`
UIDAttr string `json:"uid-attr"`
SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync-del-old-users"`
// Should a non-existent user be added to the DB if user exists in ldap directory
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type LdapAuthenticator struct {
syncPassword string
UserAttr string
UIDAttr string
}
var _ Authenticator = (*LdapAuthenticator)(nil)
@@ -28,17 +46,21 @@ var _ Authenticator = (*LdapAuthenticator)(nil)
func (la *LdapAuthenticator) Init() error {
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
if la.syncPassword == "" {
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
cclog.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
}
lc := config.Keys.LdapConfig
if lc.UserAttr != "" {
la.UserAttr = lc.UserAttr
if Keys.LdapConfig.UserAttr != "" {
la.UserAttr = Keys.LdapConfig.UserAttr
} else {
la.UserAttr = "gecos"
}
if Keys.LdapConfig.UIDAttr != "" {
la.UIDAttr = Keys.LdapConfig.UIDAttr
} else {
la.UIDAttr = "uid"
}
return nil
}
@@ -48,60 +70,50 @@ func (la *LdapAuthenticator) CanLogin(
rw http.ResponseWriter,
r *http.Request,
) (*schema.User, bool) {
lc := config.Keys.LdapConfig
lc := Keys.LdapConfig
if user != nil {
if user.AuthSource == schema.AuthViaLDAP {
return user, true
}
} else {
if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
log.Error("LDAP connection error")
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
[]string{"dn", "uid", la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
log.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
log.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
name := entry.GetAttributeValue(la.UserAttr)
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user = &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
if err := repository.GetUserRepository().AddUser(user); err != nil {
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
return nil, false
}
return user, true
} else if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
user = &schema.User{
Username: username,
Name: entry.GetAttributeValue(la.UserAttr),
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
handleLdapUser(user)
return user, true
}
return nil, false
@@ -114,14 +126,14 @@ func (la *LdapAuthenticator) Login(
) (*schema.User, error) {
l, err := la.getLdapConnection(false)
if err != nil {
log.Warn("Error while getting ldap connection")
cclog.Warn("Error while getting ldap connection")
return nil, err
}
defer l.Close()
userDn := strings.Replace(config.Keys.LdapConfig.UserBind, "{username}", user.Username, -1)
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
user.Username, err)
return nil, fmt.Errorf("Authentication failed")
}
@@ -130,11 +142,11 @@ func (la *LdapAuthenticator) Login(
}
func (la *LdapAuthenticator) Sync() error {
const IN_DB int = 1
const IN_LDAP int = 2
const IN_BOTH int = 3
const InDB int = 1
const InLdap int = 2
const InBoth int = 3
ur := repository.GetUserRepository()
lc := config.Keys.LdapConfig
lc := Keys.LdapConfig
users := map[string]int{}
usernames, err := ur.GetLdapUsernames()
@@ -143,12 +155,12 @@ func (la *LdapAuthenticator) Sync() error {
}
for _, username := range usernames {
users[username] = IN_DB
users[username] = InDB
}
l, err := la.getLdapConnection(true)
if err != nil {
log.Error("LDAP connection error")
cclog.Error("LDAP connection error")
return err
}
defer l.Close()
@@ -157,50 +169,49 @@ func (la *LdapAuthenticator) Sync() error {
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
lc.UserFilter,
[]string{"dn", "uid", la.UserAttr}, nil))
[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
if err != nil {
log.Warn("LDAP search error")
cclog.Warn("LDAP search error")
return err
}
newnames := map[string]string{}
for _, entry := range ldapResults.Entries {
username := entry.GetAttributeValue("uid")
username := entry.GetAttributeValue(la.UIDAttr)
if username == "" {
return errors.New("no attribute 'uid'")
return fmt.Errorf("no attribute '%s'", la.UIDAttr)
}
_, ok := users[username]
if !ok {
users[username] = IN_LDAP
users[username] = InLdap
newnames[username] = entry.GetAttributeValue(la.UserAttr)
} else {
users[username] = IN_BOTH
users[username] = InBoth
}
}
for username, where := range users {
if where == IN_DB && lc.SyncDelOldUsers {
ur.DelUser(username)
log.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == IN_LDAP {
if where == InDB && lc.SyncDelOldUsers {
if err := ur.DelUser(username); err != nil {
cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
return err
}
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == InLdap {
name := newnames[username]
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user := &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthSource: schema.AuthViaLDAP,
}
log.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
cclog.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
if err := ur.AddUser(user); err != nil {
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
return err
}
}
@@ -210,17 +221,19 @@ func (la *LdapAuthenticator) Sync() error {
}
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
lc := config.Keys.LdapConfig
conn, err := ldap.DialURL(lc.Url)
lc := Keys.LdapConfig
conn, err := ldap.DialURL(lc.URL,
ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
if err != nil {
log.Warn("LDAP URL dial failed")
cclog.Warn("LDAP URL dial failed")
return nil, err
}
conn.SetTimeout(30 * time.Second)
if admin {
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
conn.Close()
log.Warn("LDAP connection bind failed")
cclog.Warn("LDAP connection bind failed")
return nil, err
}
}

View File

@@ -1,15 +1,16 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"golang.org/x/crypto/bcrypt"
)
@@ -27,19 +28,19 @@ func (la *LocalAuthenticator) CanLogin(
user *schema.User,
username string,
rw http.ResponseWriter,
r *http.Request) (*schema.User, bool) {
r *http.Request,
) (*schema.User, bool) {
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
}
func (la *LocalAuthenticator) Login(
user *schema.User,
rw http.ResponseWriter,
r *http.Request) (*schema.User, error) {
r *http.Request,
) (*schema.User, error) {
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
[]byte(r.FormValue("password"))); e != nil {
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
cclog.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
return nil, fmt.Errorf("Authentication failed")
}

View File

@@ -1,27 +1,34 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
import (
"context"
"crypto/rand"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"github.com/go-chi/chi/v5"
"golang.org/x/oauth2"
)
type OpenIDConfig struct {
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type OIDC struct {
client *oauth2.Config
provider *oidc.Provider
@@ -44,30 +51,35 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
MaxAge: int(time.Hour.Seconds()),
Secure: r.TLS != nil,
HttpOnly: true,
SameSite: http.SameSiteLaxMode,
}
http.SetCookie(w, c)
}
// NewOIDC creates a new OIDC authenticator with the configured provider
func NewOIDC(a *Authentication) *OIDC {
provider, err := oidc.NewProvider(context.Background(), config.Keys.OpenIDConfig.Provider)
// Use context with timeout for provider initialization
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
if err != nil {
log.Fatal(err)
cclog.Fatal(err)
}
clientID := os.Getenv("OID_CLIENT_ID")
if clientID == "" {
log.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
cclog.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
}
clientSecret := os.Getenv("OID_CLIENT_SECRET")
if clientSecret == "" {
log.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
cclog.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
}
client := &oauth2.Config{
ClientID: clientID,
ClientSecret: clientSecret,
Endpoint: provider.Endpoint(),
RedirectURL: "oidc-callback",
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
Scopes: []string{oidc.ScopeOpenID, "profile"},
}
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
@@ -75,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
return oa
}
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
r.HandleFunc("/oidc-login", oa.OAuth2Login)
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
}
@@ -105,55 +117,99 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
http.Error(rw, "Code not found", http.StatusBadRequest)
return
}
token, err := oa.client.Exchange(context.Background(), code, oauth2.VerifierOption(codeVerifier))
// Exchange authorization code for token with timeout
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
if err != nil {
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("token exchange failed: %s", err.Error())
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
return
}
userInfo, err := oa.provider.UserInfo(context.Background(), oauth2.StaticTokenSource(token))
// Get user info from OIDC provider with same timeout
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
if err != nil {
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to get userinfo: %s", err.Error())
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
return
}
// // Extract the ID Token from OAuth2 token.
// rawIDToken, ok := token.Extra("id_token").(string)
// if !ok {
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
// }
//
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
// // Parse and verify ID Token payload.
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
// if err != nil {
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
// }
// Verify ID token and nonce to prevent replay attacks
rawIDToken, ok := token.Extra("id_token").(string)
if !ok {
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
return
}
nonceCookie, err := r.Cookie("nonce")
if err != nil {
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
return
}
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
idToken, err := verifier.Verify(ctx, rawIDToken)
if err != nil {
cclog.Errorf("ID token verification failed: %s", err.Error())
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
return
}
if idToken.Nonce != nonceCookie.Value {
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
return
}
projects := make([]string, 0)
// Extract custom claims
// Extract custom claims from userinfo
var claims struct {
Username string `json:"preferred_username"`
Name string `json:"name"`
Profile struct {
// Keycloak realm-level roles
RealmAccess struct {
Roles []string `json:"roles"`
} `json:"realm_access"`
// Keycloak client-level roles
ResourceAccess struct {
Client struct {
Roles []string `json:"roles"`
} `json:"clustercockpit"`
} `json:"resource_access"`
}
if err := userInfo.Claims(&claims); err != nil {
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to extract claims: %s", err.Error())
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
return
}
if claims.Username == "" {
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
return
}
// Merge roles from both client-level and realm-level access
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)
roleSet := make(map[string]bool)
for _, r := range oidcRoles {
switch r {
case "user":
roleSet[schema.GetRoleString(schema.RoleUser)] = true
case "admin":
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
case "manager":
roleSet[schema.GetRoleString(schema.RoleManager)] = true
case "support":
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
}
}
var roles []string
for _, r := range claims.Profile.Client.Roles {
switch r {
case "user":
roles = append(roles, schema.GetRoleString(schema.RoleUser))
case "admin":
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
}
for role := range roleSet {
roles = append(roles, role)
}
if len(roles) == 0 {
@@ -168,14 +224,18 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
AuthSource: schema.AuthViaOIDC,
}
if config.Keys.OpenIDConfig.SyncUserOnLogin || config.Keys.OpenIDConfig.UpdateUserOnLogin {
if Keys.OpenIDConfig.SyncUserOnLogin || Keys.OpenIDConfig.UpdateUserOnLogin {
handleOIDCUser(user)
}
oa.authentication.SaveSession(rw, r, user)
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(ctx))
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
return
}
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
}
func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
@@ -190,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
codeVerifier := oauth2.GenerateVerifier()
setCallbackCookie(rw, r, "verifier", codeVerifier)
// Generate nonce for ID token replay protection
nonce, err := randString(16)
if err != nil {
http.Error(rw, "Internal error", http.StatusInternalServerError)
return
}
setCallbackCookie(rw, r, "nonce", nonce)
// Build redirect URL from the incoming request
scheme := "https"
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
scheme = "http"
}
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
// Redirect user to consent page to ask for permission
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
oauth2.S256ChallengeOption(codeVerifier),
oidc.Nonce(nonce))
http.Redirect(rw, r, url, http.StatusFound)
}

111
internal/auth/schema.go Normal file
View File

@@ -0,0 +1,111 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package auth
var configSchema = `
{
"jwts": {
"description": "For JWT token authentication.",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookie-name": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validate-user": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trusted-issuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": ["max-age"]
},
"oidc": {
"type": "object",
"properties": {
"provider": {
"description": "OpenID Connect provider URL.",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided.",
"type": "boolean"
}
},
"required": ["provider"]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user-base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search-dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user-bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user-filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username-attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync-interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync-del-old-users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"uid-attr": {
"description": "LDAP attribute used as login username. Default: uid",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.",
"type": "boolean"
}
},
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
},
"required": ["jwts"]
}`

View File

@@ -1,72 +1,147 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package config implements the program configuration data structures, validation and parsing
package config
import (
"bytes"
"encoding/json"
"log"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
)
var Keys schema.ProgramConfig = schema.ProgramConfig{
type ProgramConfig struct {
// Address where the http (or https) server will listen on (for example: 'localhost:80').
Addr string `json:"addr"`
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
APIAllowedIPs []string `json:"api-allowed-ips"`
APISubjects *NATSConfig `json:"api-subjects"`
// Drop root permissions once .env was read and the port was taken.
User string `json:"user"`
Group string `json:"group"`
// Disable authentication (for everything: API, Web-UI, ...)
DisableAuthentication bool `json:"disable-authentication"`
// If `embed-static-files` is true (default), the frontend files are directly
// embeded into the go binary and expected to be in web/frontend. Only if
// it is false the files in `static-files` are served instead.
EmbedStaticFiles bool `json:"embed-static-files"`
StaticFiles string `json:"static-files"`
// Path to SQLite database file
DB string `json:"db"`
EnableJobTaggers bool `json:"enable-job-taggers"`
// Validate json input against schema
Validate bool `json:"validate"`
// If 0 or empty, the session does not expire!
SessionMaxAge string `json:"session-max-age"`
// If both those options are not empty, use HTTPS using those certificates.
HTTPSCertFile string `json:"https-cert-file"`
HTTPSKeyFile string `json:"https-key-file"`
// If not the empty string and `addr` does not end in ":80",
// redirect every request incoming at port 80 to that url.
RedirectHTTPTo string `json:"redirect-http-to"`
// Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"`
// If not zero, automatically mark jobs as stopped running X seconds longer than their walltime.
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
// Energy Mix CO2 Emission Constant [g/kWh]
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
EmissionConstant int `json:"emission-constant"`
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
// Systemd unit name for log viewer (default: "clustercockpit")
SystemdUnit string `json:"systemd-unit"`
// Node state retention configuration
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
}
type NodeStateRetention struct {
Policy string `json:"policy"` // "delete" or "move"
Age int `json:"age"` // hours, default 24
TargetKind string `json:"target-kind"` // "file" or "s3"
TargetPath string `json:"target-path"`
TargetEndpoint string `json:"target-endpoint"`
TargetBucket string `json:"target-bucket"`
TargetAccessKey string `json:"target-access-key"`
TargetSecretKey string `json:"target-secret-key"`
TargetRegion string `json:"target-region"`
TargetUsePathStyle bool `json:"target-use-path-style"`
MaxFileSizeMB int `json:"max-file-size-mb"`
}
type ResampleConfig struct {
// Minimum number of points to trigger resampling of data
MinimumPoints int `json:"minimum-points"`
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
type NATSConfig struct {
SubjectJobEvent string `json:"subject-job-event"`
SubjectNodeState string `json:"subject-node-state"`
}
type IntRange struct {
From int `json:"from"`
To int `json:"to"`
}
type TimeRange struct {
From *time.Time `json:"from"`
To *time.Time `json:"to"`
Range string `json:"range,omitempty"`
}
type FilterRanges struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"num-nodes"`
StartTime *TimeRange `json:"start-time"`
}
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
EmbedStaticFiles: true,
DBDriver: "sqlite3",
DB: "./var/job.db",
Archive: json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`),
DisableArchive: false,
Validate: false,
SessionMaxAge: "168h",
StopJobsExceedingWalltime: 0,
ShortRunningJobsDuration: 5 * 60,
UiDefaults: map[string]interface{}{
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_showFootprint": true,
"job_list_usePaging": false,
"plot_general_colorBackground": true,
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
"plot_general_lineWidth": 3,
"plot_list_jobsPerPage": 50,
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw"},
"plot_view_plotsPerRow": 3,
"plot_view_showPolarplot": true,
"plot_view_showRoofline": true,
"plot_view_showStatTable": true,
"system_view_selectedMetric": "cpu_load",
"analysis_view_selectedTopEntity": "user",
"analysis_view_selectedTopCategory": "totalWalltime",
"status_view_selectedTopUserCategory": "totalJobs",
"status_view_selectedTopProjectCategory": "totalJobs",
},
}
func Init(flagConfigFile string) {
raw, err := os.ReadFile(flagConfigFile)
if err != nil {
if !os.IsNotExist(err) {
log.Fatalf("CONFIG ERROR: %v", err)
}
} else {
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
log.Fatalf("Validate config: %v\n", err)
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
log.Fatalf("could not decode: %v", err)
}
func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
log.Fatal("At least one cluster required in config!")
}
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}
}

View File

@@ -1,16 +1,26 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
"testing"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
Init(fp)
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
if Keys.Addr != "0.0.0.0:443" {
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
}
@@ -18,7 +28,13 @@ func TestInit(t *testing.T) {
func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
Init(fp)
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
if Keys.Addr != "127.0.0.1:8080" {
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
}

View File

@@ -1,3 +1,8 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
@@ -6,9 +11,11 @@ import (
"strings"
)
// DEPRECATED: SUPERSEDED BY NEW USER CONFIG - userConfig.go / web.go
type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
DefaultMetrics string `json:"default-metrics"`
}
type DefaultMetricsConfig struct {

182
internal/config/schema.go Normal file
View File

@@ -0,0 +1,182 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
var configSchema = `
{
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"api-allowed-ips": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
"type": "string"
}
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in web/frontend/public should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db": {
"description": "Path to SQLite database file (e.g., './var/job.db')",
"type": "string"
},
"enable-job-taggers": {
"description": "Turn on automatic application and jobclass taggers",
"type": "boolean"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"type": "integer"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
},
"emission-constant": {
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
"type": "integer"
},
"machine-state-dir": {
"description": "Where to store MachineState files.",
"type": "string"
},
"systemd-unit": {
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
"type": "string"
},
"resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"minimum-points": {
"description": "Minimum points to trigger resampling of time-series data.",
"type": "integer"
},
"trigger": {
"description": "Trigger next zoom level at less than this many visible datapoints.",
"type": "integer"
},
"resolutions": {
"description": "Array of resampling target resolutions, in seconds.",
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": ["trigger", "resolutions"]
},
"api-subjects": {
"description": "NATS subjects configuration for subscribing to job and node events.",
"type": "object",
"properties": {
"subject-job-event": {
"description": "NATS subject for job events (start_job, stop_job)",
"type": "string"
},
"subject-node-state": {
"description": "NATS subject for node state updates",
"type": "string"
}
},
"required": ["subject-job-event", "subject-node-state"]
},
"nodestate-retention": {
"description": "Node state retention configuration for cleaning up old node_state rows.",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
"type": "string",
"enum": ["delete", "move"]
},
"age": {
"description": "Retention age in hours (default: 24).",
"type": "integer"
},
"target-kind": {
"description": "Target kind for parquet archiving: 'file' or 's3'.",
"type": "string",
"enum": ["file", "s3"]
},
"target-path": {
"description": "Filesystem path for parquet file target.",
"type": "string"
},
"target-endpoint": {
"description": "S3 endpoint URL.",
"type": "string"
},
"target-bucket": {
"description": "S3 bucket name.",
"type": "string"
},
"target-access-key": {
"description": "S3 access key.",
"type": "string"
},
"target-secret-key": {
"description": "S3 secret key.",
"type": "string"
},
"target-region": {
"description": "S3 region.",
"type": "string"
},
"target-use-path-style": {
"description": "Use path-style S3 addressing.",
"type": "boolean"
},
"max-file-size-mb": {
"description": "Maximum parquet file size in MB (default: 128).",
"type": "integer"
}
},
"required": ["policy"]
}
}
}`

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package config
import (
"encoding/json"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)
func Validate(schema string, instance json.RawMessage) {
sch, err := jsonschema.CompileString("schema.json", schema)
if err != nil {
cclog.Fatalf("%#v", err)
}
var v any
if err := json.Unmarshal([]byte(instance), &v); err != nil {
cclog.Fatal(err)
}
if err = sch.Validate(v); err != nil {
cclog.Fatalf("%#v", err)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package model

View File

@@ -3,14 +3,28 @@
package model
import (
"bytes"
"fmt"
"io"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type ClusterMetricWithName struct {
Name string `json:"name"`
Unit *schema.Unit `json:"unit,omitempty"`
Timestep int `json:"timestep"`
Data []schema.Float `json:"data"`
}
type ClusterMetrics struct {
NodeCount int `json:"nodeCount"`
Metrics []*ClusterMetricWithName `json:"metrics"`
}
type Count struct {
Name string `json:"name"`
Count int `json:"count"`
@@ -50,23 +64,26 @@ type IntRangeOutput struct {
type JobFilter struct {
Tags []string `json:"tags,omitempty"`
DbID []string `json:"dbId,omitempty"`
JobID *StringInput `json:"jobId,omitempty"`
ArrayJobID *int `json:"arrayJobId,omitempty"`
User *StringInput `json:"user,omitempty"`
Project *StringInput `json:"project,omitempty"`
JobName *StringInput `json:"jobName,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *schema.IntRange `json:"duration,omitempty"`
Duration *config.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
MinRunningFor *int `json:"minRunningFor,omitempty"`
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
StartTime *schema.TimeRange `json:"startTime,omitempty"`
NumNodes *config.IntRange `json:"numNodes,omitempty"`
NumAccelerators *config.IntRange `json:"numAccelerators,omitempty"`
NumHWThreads *config.IntRange `json:"numHWThreads,omitempty"`
StartTime *config.TimeRange `json:"startTime,omitempty"`
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Exclusive *int `json:"exclusive,omitempty"`
Shared *string `json:"shared,omitempty"`
Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}
@@ -81,11 +98,6 @@ type JobLinkResultList struct {
Count *int `json:"count,omitempty"`
}
type JobMetricStatWithName struct {
Name string `json:"name"`
Stats *schema.MetricStatistics `json:"stats"`
}
type JobMetricWithName struct {
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
@@ -100,9 +112,23 @@ type JobResultList struct {
HasNextPage *bool `json:"hasNextPage,omitempty"`
}
type JobStats struct {
ID int `json:"id"`
JobID string `json:"jobId"`
StartTime int `json:"startTime"`
Duration int `json:"duration"`
Cluster string `json:"cluster"`
SubCluster string `json:"subCluster"`
NumNodes int `json:"numNodes"`
NumHWThreads *int `json:"numHWThreads,omitempty"`
NumAccelerators *int `json:"numAccelerators,omitempty"`
Stats []*NamedStats `json:"stats"`
}
type JobsStatistics struct {
ID string `json:"id"`
Name string `json:"name"`
TotalUsers int `json:"totalUsers"`
TotalJobs int `json:"totalJobs"`
RunningJobs int `json:"runningJobs"`
ShortJobs int `json:"shortJobs"`
@@ -147,12 +173,49 @@ type MetricStatItem struct {
type Mutation struct {
}
type NamedStats struct {
Name string `json:"name"`
Data *schema.MetricStatistics `json:"data"`
}
type NamedStatsWithScope struct {
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
Stats []*ScopedStats `json:"stats"`
}
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
HealthState *string `json:"healthState,omitempty"`
TimeStart *int `json:"timeStart,omitempty"`
}
type NodeMetrics struct {
Host string `json:"host"`
State string `json:"state"`
SubCluster string `json:"subCluster"`
Metrics []*JobMetricWithName `json:"metrics"`
}
type NodeStateResultList struct {
Items []*schema.Node `json:"items"`
Count *int `json:"count,omitempty"`
}
type NodeStates struct {
State string `json:"state"`
Count int `json:"count"`
}
type NodeStatesTimed struct {
State string `json:"state"`
Counts []int `json:"counts"`
Times []int `json:"times"`
}
type NodesResultList struct {
Items []*NodeMetrics `json:"items"`
Offset *int `json:"offset,omitempty"`
@@ -173,6 +236,12 @@ type PageRequest struct {
Page int `json:"page"`
}
type ScopedStats struct {
Hostname string `json:"hostname"`
ID *string `json:"id,omitempty"`
Data *schema.MetricStatistics `json:"data"`
}
type StringInput struct {
Eq *string `json:"eq,omitempty"`
Neq *string `json:"neq,omitempty"`
@@ -203,20 +272,22 @@ type User struct {
type Aggregate string
const (
AggregateUser Aggregate = "USER"
AggregateProject Aggregate = "PROJECT"
AggregateCluster Aggregate = "CLUSTER"
AggregateUser Aggregate = "USER"
AggregateProject Aggregate = "PROJECT"
AggregateCluster Aggregate = "CLUSTER"
AggregateSubcluster Aggregate = "SUBCLUSTER"
)
var AllAggregate = []Aggregate{
AggregateUser,
AggregateProject,
AggregateCluster,
AggregateSubcluster,
}
func (e Aggregate) IsValid() bool {
switch e {
case AggregateUser, AggregateProject, AggregateCluster:
case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster:
return true
}
return false
@@ -243,11 +314,26 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *Aggregate) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e Aggregate) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}
type SortByAggregate string
const (
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
SortByAggregateTotalusers SortByAggregate = "TOTALUSERS"
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
@@ -259,6 +345,7 @@ const (
var AllSortByAggregate = []SortByAggregate{
SortByAggregateTotalwalltime,
SortByAggregateTotaljobs,
SortByAggregateTotalusers,
SortByAggregateTotalnodes,
SortByAggregateTotalnodehours,
SortByAggregateTotalcores,
@@ -269,7 +356,7 @@ var AllSortByAggregate = []SortByAggregate{
func (e SortByAggregate) IsValid() bool {
switch e {
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
return true
}
return false
@@ -296,6 +383,20 @@ func (e SortByAggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *SortByAggregate) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e SortByAggregate) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}
type SortDirectionEnum string
const (
@@ -336,3 +437,17 @@ func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
func (e *SortDirectionEnum) UnmarshalJSON(b []byte) error {
s, err := strconv.Unquote(string(b))
if err != nil {
return err
}
return e.UnmarshalGQL(s)
}
func (e SortDirectionEnum) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
e.MarshalGQL(&buf)
return buf.Bytes(), nil
}

View File

@@ -4,7 +4,7 @@ import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)
@@ -32,7 +32,7 @@ func Init() {
func GetResolverInstance() *Resolver {
if resolverInstance == nil {
log.Fatal("Authentication module not initialized!")
cclog.Fatal("Authentication module not initialized!")
}
return resolverInstance

View File

@@ -1,13 +1,15 @@
package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.66
// Code generated by github.com/99designs/gqlgen version v0.17.85
import (
"context"
"errors"
"fmt"
"math"
"regexp"
"slices"
"strconv"
@@ -17,11 +19,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Partitions is the resolver for the partitions field.
@@ -29,15 +32,21 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
return r.Repo.Partitions(obj.Name)
}
// StartTime is the resolver for the startTime field.
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
timestamp := time.Unix(obj.StartTime, 0)
return &timestamp, nil
}
// Tags is the resolver for the tags field.
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID)
return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID)
}
// ConcurrentJobs is the resolver for the concurrentJobs field.
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
// FIXME: Make the hardcoded duration configurable
if obj.Exclusive != 1 && obj.Duration > 600 {
if obj.Shared != "none" && obj.Duration > 600 {
return r.Repo.FindConcurrentJobs(ctx, obj)
}
@@ -48,7 +57,7 @@ func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*mod
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
rawFootprint, err := r.Repo.FetchFootprint(obj)
if err != nil {
log.Warn("Error while fetching job footprint data")
cclog.Warn("Error while fetching job footprint data")
return nil, err
}
@@ -73,21 +82,21 @@ func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
if err != nil {
log.Warn("Error while fetching job energy footprint data")
cclog.Warn("Error while fetching job energy footprint data")
return nil, err
}
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)
hwType := ""
switch test := name; { // NOtice ';' for var declaration
case matchCpu.MatchString(test):
case matchCPU.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
@@ -125,40 +134,75 @@ func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue)
// CreateTag is the resolver for the createTag field.
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
id, err := r.Repo.CreateTag(typeArg, name, scope)
if err != nil {
log.Warn("Error while creating tag")
return nil, err
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && scope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && scope == "global" ||
user.Username == scope {
// Create in DB
id, err := r.Repo.CreateTag(typeArg, name, scope)
if err != nil {
cclog.Warn("Error while creating tag")
return nil, err
}
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
} else {
cclog.Warnf("Not authorized to create tag with scope: %s", scope)
return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope)
}
}
// DeleteTag is the resolver for the deleteTag field.
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
// This Uses ID string <-> ID string, removeTagFromList uses []string <-> []int
panic(fmt.Errorf("not implemented: DeleteTag - deleteTag"))
}
// AddTagsToJob is the resolver for the addTagsToJob field.
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
// Selectable Tags Pre-Filtered by Scope in Frontend: No backend check required
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while adding tag to job")
cclog.Warn("Error while adding tag to job")
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
cclog.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.AddTag(repository.GetUserFromContext(ctx), jid, tid); err != nil {
log.Warn("Error while adding tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
cclog.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Add to Job
if tags, err = r.Repo.AddTag(user, jid, tid); err != nil {
cclog.Warn("Error while adding tag")
return nil, err
}
} else {
cclog.Warnf("Not authorized to add tag: %d", tid)
return nil, fmt.Errorf("not authorized to add tag: %d", tid)
}
}
@@ -167,40 +211,148 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
// Removable Tags Pre-Filtered by Scope in Frontend: No backend check required
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while parsing job id")
cclog.Warn("Error while parsing job id")
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
cclog.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.RemoveTag(repository.GetUserFromContext(ctx), jid, tid); err != nil {
log.Warn("Error while removing tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
cclog.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Remove from Job
if tags, err = r.Repo.RemoveTag(user, jid, tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
}
} else {
cclog.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
}
}
return tags, nil
}
// RemoveTagFromList is the resolver for the removeTagFromList field.
func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) {
// Needs Contextuser
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
tags := []int{}
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id for removal")
return nil, err
}
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
cclog.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
tags = append(tags, int(tid))
}
} else {
cclog.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
}
}
return tags, nil
}
// UpdateConfiguration is the resolver for the updateConfiguration field.
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
if err := repository.GetUserCfgRepo().UpdateConfig(name, value, repository.GetUserFromContext(ctx)); err != nil {
log.Warn("Error while updating user config")
cclog.Warn("Error while updating user config")
return nil, err
}
return nil, nil
}
// ID is the resolver for the id field.
func (r *nodeResolver) ID(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: ID - id"))
}
// SchedulerState is the resolver for the schedulerState field.
func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (schema.SchedulerState, error) {
if obj.NodeState != "" {
return obj.NodeState, nil
} else {
return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object")
}
}
// HealthState is the resolver for the healthState field.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
if obj.HealthState != "" {
return string(obj.HealthState), nil
} else {
return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object")
}
}
// MetaData is the resolver for the metaData field.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
if obj.MetaData != nil {
return obj.MetaData, nil
} else {
cclog.Debug("resolver: no MetaData (NodeState) on node object")
emptyMeta := make(map[string]string, 0)
return emptyMeta, nil
}
}
// HealthData is the resolver for the healthData field.
func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) {
if obj.HealthData != nil {
return obj.HealthData, nil
} else {
cclog.Debug("resolver: no HealthData (NodeState) on node object")
emptyHealth := make(map[string][]string, 0)
return emptyHealth, nil
}
}
// Clusters is the resolver for the clusters field.
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) {
return archive.Clusters, nil
@@ -213,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
user := repository.GetUserFromContext(ctx)
if user != nil {
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
return archive.GlobalUserMetricList, nil
}
}
return archive.GlobalMetricList, nil
}
@@ -225,7 +385,7 @@ func (r *queryResolver) User(ctx context.Context, username string) (*model.User,
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
data, err := r.Repo.AllocatedNodes(cluster)
if err != nil {
log.Warn("Error while fetching allocated nodes")
cclog.Warn("Error while fetching allocated nodes")
return nil, err
}
@@ -240,17 +400,91 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
return counts, nil
}
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
repo := repository.GetNodeRepository()
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
return repo.GetNodeByID(numericID, false)
}
// Nodes is the resolver for the nodes field.
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
repo := repository.GetNodeRepository()
nodes, err := repo.QueryNodes(ctx, filter, nil, order) // Ignore Paging, Order Unused
count := len(nodes)
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodesWithMeta is the resolver for the nodesWithMeta field.
func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
// Why Extra Handler? -> graphql.CollectAllFields(ctx) only returns toplevel fields (i.e.: items, count), and not subfields like item.metaData
repo := repository.GetNodeRepository()
nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused
count := len(nodes)
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodeStates is the resolver for the nodeStates field.
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
repo := repository.GetNodeRepository()
stateCounts, serr := repo.CountStates(ctx, filter, "node_state")
if serr != nil {
cclog.Warnf("Error while counting nodeStates: %s", serr.Error())
return nil, serr
}
healthCounts, herr := repo.CountStates(ctx, filter, "health_state")
if herr != nil {
cclog.Warnf("Error while counting healthStates: %s", herr.Error())
return nil, herr
}
allCounts := append(stateCounts, healthCounts...)
return allCounts, nil
}
// NodeStatesTimed is the resolver for the nodeStatesTimed field.
func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.NodeFilter, typeArg string) ([]*model.NodeStatesTimed, error) {
repo := repository.GetNodeRepository()
if typeArg == "node" {
stateCounts, serr := repo.CountStatesTimed(ctx, filter, "node_state")
if serr != nil {
cclog.Warnf("Error while counting nodeStates in time: %s", serr.Error())
return nil, serr
}
return stateCounts, nil
}
if typeArg == "health" {
healthCounts, herr := repo.CountStatesTimed(ctx, filter, "health_state")
if herr != nil {
cclog.Warnf("Error while counting healthStates in time: %s", herr.Error())
return nil, herr
}
return healthCounts, nil
}
return nil, errors.New("unknown Node State Query Type")
}
// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
log.Warn("Error while parsing job id")
cclog.Warn("Error while parsing job id")
return nil, err
}
job, err := r.Repo.FindById(ctx, numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
log.Warn("Error while finding job by id")
cclog.Warn("Error while finding job by id")
return nil, err
}
@@ -277,13 +511,13 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warn("Error while querying job for metrics")
cclog.Warn("Error while querying job for metrics")
return nil, err
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
log.Warn("Error while loading job data")
cclog.Warn("Error while loading job data")
return nil, err
}
@@ -301,36 +535,67 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return res, err
}
// JobMetricStats is the resolver for the jobMetricStats field.
func (r *queryResolver) JobMetricStats(ctx context.Context, id string, metrics []string) ([]*model.JobMetricStatWithName, error) {
// JobStats is the resolver for the jobStats field.
func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warn("Error while querying job for metrics")
cclog.Warnf("Error while querying job %s for metadata", id)
return nil, err
}
data, err := metricDataDispatcher.LoadStatData(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
log.Warn("Error while loading job stat data")
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
}
res := []*model.JobMetricStatWithName{}
res := []*model.NamedStats{}
for name, md := range data {
res = append(res, &model.JobMetricStatWithName{
Name: name,
Stats: &md,
res = append(res, &model.NamedStats{
Name: name,
Data: &md,
})
}
return res, err
}
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
return r.jobsFootprints(ctx, filter, metrics)
// ScopedJobStats is the resolver for the scopedJobStats field.
func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.NamedStatsWithScope, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
cclog.Warnf("Error while querying job %s for metadata", id)
return nil, err
}
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
}
res := make([]*model.NamedStatsWithScope, 0)
for name, scoped := range data {
for scope, stats := range scoped {
mdlStats := make([]*model.ScopedStats, 0)
for _, stat := range stats {
mdlStats = append(mdlStats, &model.ScopedStats{
Hostname: stat.Hostname,
ID: stat.ID,
Data: stat.Data,
})
}
res = append(res, &model.NamedStatsWithScope{
Name: name,
Scope: scope,
Stats: mdlStats,
})
}
}
return res, nil
}
// Jobs is the resolver for the jobs field.
@@ -344,40 +609,38 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
if err != nil {
log.Warn("Error while querying jobs")
cclog.Warn("Error while querying jobs")
return nil, err
}
count, err := r.Repo.CountJobs(ctx, filter)
if err != nil {
log.Warn("Error while counting jobs")
cclog.Warn("Error while counting jobs")
return nil, err
}
if !config.Keys.UiDefaults["job_list_usePaging"].(bool) {
hasNextPage := false
// page.Page += 1 : Simple, but expensive
// Example Page 4 @ 10 IpP : Does item 41 exist?
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
hasNextPage := false
if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
log.Warn("Error while querying next jobs")
cclog.Warn("Error while querying next jobs")
return nil, err
}
if len(nextJobs) == 1 {
hasNextPage = true
}
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
} else {
return &model.JobResultList{Items: jobs, Count: &count}, nil
hasNextPage = len(nextJobs) == 1
}
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
// JobsStatistics is the resolver for the jobsStatistics field.
@@ -386,10 +649,10 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
var stats []*model.JobsStatistics
// Top Level Defaults
var defaultDurationBins string = "1h"
var defaultMetricBins int = 10
defaultDurationBins := "1h"
defaultMetricBins := 10
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalUsers") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
if groupBy == nil {
stats, err = r.Repo.JobsStats(ctx, filter)
@@ -456,6 +719,62 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
return stats, nil
}
// JobsMetricStats is the resolver for the jobsMetricStats field.
func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.JobStats, error) {
// No Paging, Fixed Order by StartTime ASC
order := &model.OrderByInput{
Field: "startTime",
Type: "col",
Order: "ASC",
}
jobs, err := r.Repo.QueryJobs(ctx, filter, nil, order)
if err != nil {
cclog.Warn("Error while querying jobs for comparison")
return nil, err
}
res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
// return nil, err
}
sres := []*model.NamedStats{}
for name, md := range data {
sres = append(sres, &model.NamedStats{
Name: name,
Data: &md,
})
}
numThreadsInt := int(job.NumHWThreads)
numAccsInt := int(job.NumAcc)
res = append(res, &model.JobStats{
ID: int(*job.ID),
JobID: strconv.Itoa(int(job.JobID)),
StartTime: int(job.StartTime),
Duration: int(job.Duration),
Cluster: job.Cluster,
SubCluster: job.SubCluster,
NumNodes: int(job.NumNodes),
NumHWThreads: &numThreadsInt,
NumAccelerators: &numAccsInt,
Stats: sres,
})
}
return res, err
}
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
return r.jobsFootprints(ctx, filter, metrics)
}
// RooflineHeatmap is the resolver for the rooflineHeatmap field.
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
@@ -468,27 +787,37 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}
defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
defaultMetrics = append(defaultMetrics, mc.Name)
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
metrics = defaultMetrics
} else {
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
})
}
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
log.Warn("error while loading node data")
cclog.Warn("error while loading node data")
return nil, err
}
nodeRepo := repository.GetNodeRepository()
stateMap, _ := nodeRepo.MapNodes(cluster)
nodeMetrics := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
host := &model.NodeMetrics{
Host: hostname,
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
log.Warnf("error in nodeMetrics resolver: %s", err)
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
@@ -508,7 +837,7 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
}
// NodeMetricsList is the resolver for the nodeMetricsList field.
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, stateFilter string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
if resolution == nil { // Load from Config
if config.Keys.EnableResampling != nil {
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
@@ -524,30 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nil, errors.New("you need to be administrator or support staff for this query")
}
nodeRepo := repository.GetNodeRepository()
// nodes -> array hostname
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
if nerr != nil {
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
data, totalNodes, hasNextPage, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, *resolution, from, to, page, ctx)
// data -> map hostname:jobdata
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
log.Warn("error while loading node data")
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
return nil, err
}
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
for _, hostname := range nodes {
host := &model.NodeMetrics{
Host: hostname,
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
log.Warnf("error in nodeMetrics resolver: %s", err)
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for metric, scopedMetrics := range data[hostname] {
for scope, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
@@ -561,14 +899,108 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
}
nodeMetricsListResult := &model.NodesResultList{
Items: nodeMetricsList,
TotalNodes: &totalNodes,
Items: nodeMetricsList,
// TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357
TotalNodes: &countNodes,
HasNextPage: &hasNextPage,
}
return nodeMetricsListResult, nil
}
// ClusterMetrics is the resolver for the clusterMetrics field.
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
}
clusterMetricData := make([]*model.ClusterMetricWithName, 0)
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}
collectorTimestep := make(map[string]int)
collectorUnit := make(map[string]schema.Unit)
collectorData := make(map[string][]schema.Float)
for _, metrics := range data {
clusterMetrics.NodeCount += 1
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
// Collect Info Once
_, okTimestep := collectorTimestep[metric]
if !okTimestep {
collectorTimestep[metric] = scopedMetric.Timestep
}
_, okUnit := collectorUnit[metric]
if !okUnit {
collectorUnit[metric] = scopedMetric.Unit
}
// Collect Data
for _, ser := range scopedMetric.Series {
_, okData := collectorData[metric]
// Init With Datasize > 0
if !okData && len(ser.Data) != 0 {
collectorData[metric] = make([]schema.Float, len(ser.Data))
} else if !okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
}
// Sum if init'd and matching size
if okData && len(ser.Data) == len(collectorData[metric]) {
for i, val := range ser.Data {
if val.IsNaN() {
continue
} else {
collectorData[metric][i] += val
}
}
} else if okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
}
}
}
}
}
for metricName, data := range collectorData {
// use ccUnits for backend normalization to "Tera"
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
p_new := ccunit.NewPrefix("T")
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}
roundedData := make([]schema.Float, 0)
for _, v_old := range data {
v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
roundedData = append(roundedData, schema.Float(v_new))
}
cm := model.ClusterMetricWithName{
Name: metricName,
Unit: &u_new,
Timestep: collectorTimestep[metricName],
Data: roundedData,
}
clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
}
return &clusterMetrics, nil
}
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)
@@ -590,6 +1022,9 @@ func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricV
// Mutation returns generated.MutationResolver implementation.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
// Node returns generated.NodeResolver implementation.
func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} }
// Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
@@ -600,5 +1035,6 @@ type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type metricValueResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type nodeResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }

View File

@@ -1,20 +1,21 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package graph
import (
"context"
"fmt"
"math"
"slices"
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -28,7 +29,7 @@ func (r *queryResolver) rooflineHeatmap(
) ([][]float64, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
log.Error("Error while querying jobs for roofline")
cclog.Error("Error while querying jobs for roofline")
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
@@ -54,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap(
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
@@ -70,7 +71,7 @@ func (r *queryResolver) rooflineHeatmap(
flops, ok1 := flops_["node"]
membw, ok2 := membw_["node"]
if !ok1 || !ok2 {
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
cclog.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
continue
// TODO/FIXME:
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
@@ -105,7 +106,7 @@ func (r *queryResolver) rooflineHeatmap(
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
log.Error("Error while querying jobs for footprint")
cclog.Error("Error while querying jobs for footprint")
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
@@ -127,8 +128,8 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
log.Error("Error while loading averages for footprint")
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}
@@ -186,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
func requireField(ctx context.Context, name string) bool {
fields := graphql.CollectAllFields(ctx)
for _, f := range fields {
if f == name {
return true
}
}
return false
return slices.Contains(fields, name)
}

132
internal/importer/README.md Normal file
View File

@@ -0,0 +1,132 @@
# Importer Package
The `importer` package provides functionality for importing job data into the ClusterCockpit database from archived job files.
## Overview
This package supports two primary import workflows:
1. **Bulk Database Initialization** - Reinitialize the entire job database from archived jobs
2. **Individual Job Import** - Import specific jobs from metadata/data file pairs
Both workflows enrich job metadata by calculating performance footprints and energy consumption metrics before persisting to the database.
## Main Entry Points
### InitDB()
Reinitializes the job database from all archived jobs.
```go
if err := importer.InitDB(); err != nil {
log.Fatal(err)
}
```
This function:
- Flushes existing job, tag, and jobtag tables
- Iterates through all jobs in the configured archive
- Enriches each job with calculated metrics
- Inserts jobs into the database in batched transactions (100 jobs per batch)
- Continues on individual job failures, logging errors
**Use Case**: Initial database setup or complete database rebuild from archive.
### HandleImportFlag(flag string)
Imports jobs from specified file pairs.
```go
// Format: "<meta.json>:<data.json>[,<meta2.json>:<data2.json>,...]"
flag := "/path/to/meta.json:/path/to/data.json"
if err := importer.HandleImportFlag(flag); err != nil {
log.Fatal(err)
}
```
This function:
- Parses the comma-separated file pairs
- Validates metadata and job data against schemas (if validation enabled)
- Enriches each job with footprints and energy metrics
- Imports jobs into both the archive and database
- Fails fast on the first error
**Use Case**: Importing specific jobs from external sources or manual job additions.
## Job Enrichment
Both import workflows use `enrichJobMetadata()` to calculate:
### Performance Footprints
Performance footprints are calculated from metric averages based on the subcluster configuration:
```go
job.Footprint["mem_used_avg"] = 45.2 // GB
job.Footprint["cpu_load_avg"] = 0.87 // percentage
```
### Energy Metrics
Energy consumption is calculated from power metrics using the formula:
```
Energy (kWh) = (Power (W) × Duration (s) / 3600) / 1000
```
For each energy metric:
```go
job.EnergyFootprint["acc_power"] = 12.5 // kWh
job.Energy = 150.2 // Total energy in kWh
```
**Note**: Energy calculations for metrics with unit "energy" (Joules) are not yet implemented.
## Data Validation
### SanityChecks(job *schema.Job)
Validates job metadata before database insertion:
- Cluster exists in configuration
- Subcluster is valid (assigns if needed)
- Job state is valid
- Resources and user fields are populated
- Node counts and hardware thread counts are positive
- Resource count matches declared node count
## Normalization Utilities
The package includes utilities for normalizing metric values to appropriate SI prefixes:
### Normalize(avg float64, prefix string)
Adjusts values and SI prefixes for readability:
```go
factor, newPrefix := importer.Normalize(2048.0, "M")
// Converts 2048 MB → ~2.0 GB
// Returns: factor for conversion, "G"
```
This is useful for automatically scaling metrics (e.g., memory, storage) to human-readable units.
## Dependencies
- `github.com/ClusterCockpit/cc-backend/internal/repository` - Database operations
- `github.com/ClusterCockpit/cc-backend/pkg/archive` - Job archive access
- `github.com/ClusterCockpit/cc-lib/schema` - Job schema definitions
- `github.com/ClusterCockpit/cc-lib/ccLogger` - Logging
- `github.com/ClusterCockpit/cc-lib/ccUnits` - SI unit handling
## Error Handling
- **InitDB**: Continues processing on individual job failures, logs errors, returns summary
- **HandleImportFlag**: Fails fast on first error, returns immediately
- Both functions log detailed error context for debugging
## Performance
- **Transaction Batching**: InitDB processes jobs in batches of 100 for optimal database performance
- **Tag Caching**: Tag IDs are cached during import to minimize database queries
- **Progress Reporting**: InitDB prints progress updates during bulk operations

View File

@@ -1,29 +1,44 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"bytes"
"encoding/json"
"fmt"
"math"
"os"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
//
// The flag format is: "<path-to-meta.json>:<path-to-data.json>[,<path-to-meta2.json>:<path-to-data2.json>,...]"
//
// For each job pair, this function:
// 1. Reads and validates the metadata JSON file (schema.Job)
// 2. Reads and validates the job data JSON file (schema.JobData)
// 3. Enriches the job with calculated footprints and energy metrics
// 4. Validates the job using SanityChecks()
// 5. Imports the job into the archive
// 6. Inserts the job into the database with associated tags
//
// Schema validation is performed if config.Keys.Validate is true.
//
// Returns an error if file reading, validation, enrichment, or database operations fail.
// The function stops processing on the first error encountered.
func HandleImportFlag(flag string) error {
r := repository.GetJobRepository()
for _, pair := range strings.Split(flag, ",") {
for pair := range strings.SplitSeq(flag, ",") {
files := strings.Split(pair, ":")
if len(files) != 2 {
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
@@ -31,7 +46,7 @@ func HandleImportFlag(flag string) error {
raw, err := os.ReadFile(files[0])
if err != nil {
log.Warn("Error while reading metadata file for import")
cclog.Warn("Error while reading metadata file for import")
return err
}
@@ -42,15 +57,18 @@ func HandleImportFlag(flag string) error {
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
job := schema.JobMeta{BaseJob: schema.JobDefaults}
job := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
if err = dec.Decode(&job); err != nil {
log.Warn("Error while decoding raw json metadata for import")
cclog.Warn("Error while decoding raw json metadata for import")
return err
}
raw, err = os.ReadFile(files[1])
if err != nil {
log.Warn("Error while reading jobdata file for import")
cclog.Warn("Error while reading jobdata file for import")
return err
}
@@ -63,108 +81,41 @@ func HandleImportFlag(flag string) error {
dec.DisallowUnknownFields()
jobData := schema.JobData{}
if err = dec.Decode(&jobData); err != nil {
log.Warn("Error while decoding raw json jobdata for import")
cclog.Warn("Error while decoding raw json jobdata for import")
return err
}
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
if err = enrichJobMetadata(&job); err != nil {
cclog.Errorf("Error enriching job metadata: %v", err)
return err
}
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(&job, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
log.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(&job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
log.Warn("Error while marshaling job resources")
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
log.Warn("Error while marshaling job metadata")
return err
}
if err = SanityChecks(&job.BaseJob); err != nil {
log.Warn("BaseJob SanityChecks failed")
if err = SanityChecks(&job); err != nil {
cclog.Warn("BaseJob SanityChecks failed")
return err
}
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
log.Error("Error while importing job")
cclog.Error("Error while importing job")
return err
}
id, err := r.InsertJob(&job)
id, err := r.InsertJobDirect(&job)
if err != nil {
log.Warn("Error while job db insert")
cclog.Warn("Error while job db insert")
return err
}
for _, tag := range job.Tags {
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
log.Error("Error while adding or creating tag on import")
cclog.Error("Error while adding or creating tag on import")
return err
}
}
log.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
cclog.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
}
return nil
}

View File

@@ -1,5 +1,5 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer_test
@@ -16,9 +16,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// copyFile copies a file from source path to destination path.
// Used by tests to set up test fixtures.
func copyFile(s string, d string) error {
r, err := os.Open(s)
if err != nil {
@@ -34,59 +37,40 @@ func copyFile(s string, d string) error {
return nil
}
// setup initializes a test environment for importer tests.
//
// Creates a temporary directory with:
// - A test job archive with cluster configuration
// - A SQLite database initialized with schema
// - Configuration files loaded
//
// Returns a JobRepository instance for test assertions.
func setup(t *testing.T) *repository.JobRepository {
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2m"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "fritz",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 944 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "taurus",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 4000 },
"duration": { "from": 0, "to": 604800 },
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
}
}
]}`
}
}`
log.Init("info", true)
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0777); err != nil {
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
if err := os.Mkdir(fritzArchive, 0777); err != nil {
if err := os.Mkdir(fritzArchive, 0o777); err != nil {
t.Fatal(err)
}
if err := copyFile(filepath.Join("testdata", "cluster-fritz.json"),
@@ -95,27 +79,36 @@ func setup(t *testing.T) *repository.JobRepository {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
config.Init(cfgFilePath)
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
t.Fatal("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
return repository.GetJobRepository()
}
// Result represents the expected test result for job import verification.
type Result struct {
JobId int64
Cluster string
@@ -123,6 +116,8 @@ type Result struct {
Duration int32
}
// readResult reads the expected test result from a golden file.
// Golden files contain the expected job attributes after import.
func readResult(t *testing.T, testname string) Result {
var r Result
@@ -140,6 +135,13 @@ func readResult(t *testing.T, testname string) Result {
return r
}
// TestHandleImportFlag tests the HandleImportFlag function with various job import scenarios.
//
// The test uses golden files in testdata/ to verify that jobs are correctly:
// - Parsed from metadata and data JSON files
// - Enriched with footprints and energy metrics
// - Inserted into the database
// - Retrievable with correct attributes
func TestHandleImportFlag(t *testing.T) {
r := setup(t)

View File

@@ -1,7 +1,16 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package importer provides functionality for importing job data into the ClusterCockpit database.
//
// The package supports two primary use cases:
// 1. Bulk database initialization from archived jobs via InitDB()
// 2. Individual job import from file pairs via HandleImportFlag()
//
// Both operations enrich job metadata by calculating footprints and energy metrics
// before persisting to the database.
package importer
import (
@@ -13,8 +22,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const (
@@ -22,25 +31,38 @@ const (
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
)
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
// InitDB reinitializes the job database from archived job data.
//
// This function performs the following operations:
// 1. Flushes existing job, tag, and jobtag tables
// 2. Iterates through all jobs in the archive
// 3. Enriches each job with calculated footprints and energy metrics
// 4. Inserts jobs and tags into the database in batched transactions
//
// Jobs are processed in batches of 100 for optimal performance. The function
// continues processing even if individual jobs fail, logging errors and
// returning a summary at the end.
//
// Returns an error if database initialization, transaction management, or
// critical operations fail. Individual job failures are logged but do not
// stop the overall import process.
func InitDB() error {
r := repository.GetJobRepository()
if err := r.Flush(); err != nil {
log.Errorf("repository initDB(): %v", err)
cclog.Errorf("repository initDB(): %v", err)
return err
}
starttime := time.Now()
log.Print("Building job table...")
cclog.Print("Building job table...")
t, err := r.TransactionInit()
if err != nil {
log.Warn("Error while initializing SQL transactions")
cclog.Warn("Error while initializing SQL transactions")
return err
}
tags := make(map[string]int64)
// Not using log.Print because we want the line to end with `\r` and
// Not using cclog.Print because we want the line to end with `\r` and
// this function is only ever called when a special command line flag
// is passed anyways.
fmt.Printf("%d jobs inserted...\r", 0)
@@ -52,150 +74,195 @@ func InitDB() error {
for jobContainer := range ar.Iter(false) {
jobMeta := jobContainer.Meta
if jobMeta == nil {
cclog.Warn("skipping job with nil metadata")
errorOccured++
continue
}
// Bundle 100 inserts into one transaction for better performance
if i%100 == 0 {
r.TransactionCommit(t)
if i > 0 {
if err := t.Commit(); err != nil {
cclog.Errorf("transaction commit error: %v", err)
return err
}
// Start a new transaction for the next batch
t, err = r.TransactionInit()
if err != nil {
cclog.Errorf("transaction init error: %v", err)
return err
}
}
fmt.Printf("%d jobs inserted...\r", i)
}
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
return err
}
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
log.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
log.Errorf("repository initDB(): %v", err)
if err := enrichJobMetadata(jobMeta); err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
log.Errorf("repository initDB(): %v", err)
if err := SanityChecks(jobMeta); err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
if err := SanityChecks(&job.BaseJob); err != nil {
log.Errorf("repository initDB(): %v", err)
id, jobErr := r.TransactionAddNamed(t,
repository.NamedJobInsert, jobMeta)
if jobErr != nil {
cclog.Errorf("repository initDB(): %v", jobErr)
errorOccured++
continue
}
id, err := r.TransactionAddNamed(t,
repository.NamedJobInsert, job)
if err != nil {
log.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
// Job successfully inserted, increment counter
i += 1
for _, tag := range job.Tags {
for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr]
tagID, ok := tags[tagstr]
if !ok {
tagId, err = r.TransactionAdd(t,
var err error
tagID, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
if err != nil {
log.Errorf("Error adding tag: %v", err)
cclog.Errorf("Error adding tag: %v", err)
errorOccured++
continue
}
tags[tagstr] = tagId
tags[tagstr] = tagID
}
r.TransactionAdd(t,
setTagQuery,
id, tagId)
}
if err == nil {
i += 1
id, tagID)
}
}
if errorOccured > 0 {
log.Warnf("Error in import of %d jobs!", errorOccured)
cclog.Warnf("Error in import of %d jobs!", errorOccured)
}
r.TransactionEnd(t)
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
cclog.Infof("A total of %d jobs have been registered in %.3f seconds.", i, time.Since(starttime).Seconds())
return nil
}
// This function also sets the subcluster if necessary!
func SanityChecks(job *schema.BaseJob) error {
// enrichJobMetadata calculates and populates job footprints, energy metrics, and serialized fields.
//
// This function performs the following enrichment operations:
// 1. Calculates job footprint metrics based on the subcluster configuration
// 2. Computes energy footprint and total energy consumption in kWh
// 3. Marshals footprints, resources, and metadata into JSON for database storage
//
// The function expects the job's MonitoringStatus and SubCluster to be already set.
// Energy calculations convert power metrics (Watts) to energy (kWh) using the formula:
//
// Energy (kWh) = (Power (W) * Duration (s) / 3600) / 1000
//
// Returns an error if subcluster retrieval, metric indexing, or JSON marshaling fails.
func enrichJobMetadata(job *schema.Job) error {
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("cannot get subcluster: %s", err.Error())
return err
}
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(job, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
cclog.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
switch sc.MetricConfig[i].Energy {
case "energy": // this metric has energy as unit (Joules)
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
// FIXME: Needs sum as stats type
case "power": // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
cclog.Warn("Error while marshaling job resources")
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
cclog.Warn("Error while marshaling job metadata")
return err
}
return nil
}
// SanityChecks validates job metadata and ensures cluster/subcluster configuration is valid.
//
// This function performs the following validations:
// 1. Verifies the cluster exists in the archive configuration
// 2. Assigns and validates the subcluster (may modify job.SubCluster)
// 3. Validates job state is a recognized value
// 4. Ensures resources and user fields are populated
// 5. Validates node counts and hardware thread counts are positive
// 6. Verifies the number of resources matches the declared node count
//
// The function may modify the job's SubCluster field if it needs to be assigned.
//
// Returns an error if any validation check fails.
func SanityChecks(job *schema.Job) error {
if c := archive.GetCluster(job.Cluster); c == nil {
return fmt.Errorf("no such cluster: %v", job.Cluster)
}
if err := archive.AssignSubCluster(job); err != nil {
log.Warn("Error while assigning subcluster to job")
cclog.Warn("Error while assigning subcluster to job")
return err
}
if !job.State.Valid() {
@@ -214,6 +281,14 @@ func SanityChecks(job *schema.BaseJob) error {
return nil
}
// checkJobData normalizes metric units in job data based on average values.
//
// NOTE: This function is currently unused and contains incomplete implementation.
// It was intended to normalize byte and file-related metrics to appropriate SI prefixes,
// but the normalization logic is commented out. Consider removing or completing this
// function based on project requirements.
//
// TODO: Either implement the metric normalization or remove this dead code.
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
// var newUnit schema.Unit

View File

@@ -1,19 +1,34 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"math"
ccunits "github.com/ClusterCockpit/cc-units"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value
// to a more readable range (typically between 1.0 and 1000.0).
//
// For values greater than 1000, the function scales down by factors of 1000 (returns negative exponent).
// For values less than 1.0, the function scales up by factors of 1000 (returns positive exponent).
//
// Returns:
// - factor: The multiplicative factor to apply (10^(count*scale))
// - exponent: The power of 10 representing the adjustment (multiple of 3 for SI prefixes)
func getNormalizationFactor(v float64) (float64, int) {
count := 0
scale := -3
// Prevent infinite loop for zero or negative values
if v <= 0.0 {
return 1.0, 0
}
if v > 1000.0 {
for v > 1000.0 {
v *= 1e-3
@@ -29,9 +44,22 @@ func getNormalizationFactor(v float64) (float64, int) {
return math.Pow10(count * scale), count * scale
}
// getExponent calculates the SI prefix exponent from a numeric prefix value.
//
// For example:
// - Input: 1000.0 (kilo) returns 3
// - Input: 1000000.0 (mega) returns 6
// - Input: 1000000000.0 (giga) returns 9
//
// Returns the exponent representing the power of 10 for the SI prefix.
func getExponent(p float64) int {
count := 0
// Prevent infinite loop for infinity or NaN values
if math.IsInf(p, 0) || math.IsNaN(p) || p <= 0.0 {
return 0
}
for p > 1.0 {
p = p / 1000.0
count++
@@ -40,12 +68,42 @@ func getExponent(p float64) int {
return count * 3
}
// newPrefixFromFactor computes a new SI unit prefix after applying a normalization factor.
//
// Given an original prefix and an exponent adjustment, this function calculates
// the resulting SI prefix. For example, if normalizing from bytes (no prefix) by
// a factor of 10^9, the result would be the "G" (giga) prefix.
//
// Parameters:
// - op: The original SI prefix value
// - e: The exponent adjustment to apply
//
// Returns the new SI prefix after adjustment.
func newPrefixFromFactor(op ccunits.Prefix, e int) ccunits.Prefix {
f := float64(op)
exp := math.Pow10(getExponent(f) - e)
return ccunits.Prefix(exp)
}
// Normalize adjusts a metric value and its SI unit prefix to a more readable range.
//
// This function is useful for automatically scaling metrics to appropriate units.
// For example, normalizing 2048 MiB might result in ~2.0 GiB.
//
// The function analyzes the average value and determines if a different SI prefix
// would make the number more human-readable (typically keeping values between 1 and 1000).
//
// Parameters:
// - avg: The metric value to normalize
// - p: The current SI prefix as a string (e.g., "K", "M", "G")
//
// Returns:
// - factor: The multiplicative factor to apply to convert the value
// - newPrefix: The new SI prefix string to use
//
// Example:
//
// factor, newPrefix := Normalize(2048.0, "M") // returns factor for MB->GB conversion, "G"
func Normalize(avg float64, p string) (float64, string) {
f, e := getNormalizationFactor(avg)

View File

@@ -1,5 +1,5 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
@@ -8,9 +8,11 @@ import (
"fmt"
"testing"
ccunits "github.com/ClusterCockpit/cc-units"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.
// Verifies that values in the billions are correctly scaled to the "G" (giga) prefix.
func TestNormalizeFactor(t *testing.T) {
// var us string
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
@@ -38,6 +40,8 @@ func TestNormalizeFactor(t *testing.T) {
}
}
// TestNormalizeKeep tests that values already in an appropriate range maintain their prefix.
// Verifies that when values don't require rescaling, the original "G" prefix is preserved.
func TestNormalizeKeep(t *testing.T) {
s := []float64{3.0, 24.0, 390.0, 391.0}

View File

@@ -1 +1 @@
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}

View File

@@ -1 +1 @@
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}

View File

@@ -1,357 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricDataDispatcher
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/resampler"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
job.ID, job.State, metrics, scopes, resolution)
}
// Fetches the metric data for a job.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ interface{}, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
config.Keys.DisableArchive {
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
log.Warnf("partial error: %s", err.Error())
// return err, 0, 0 // Reactivating will block archiving on one partial error
} else {
log.Error("Error while loading job data from metric repository")
return err, 0, 0
}
}
size = jd.Size()
} else {
var jd_temp schema.JobData
jd_temp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
log.Error("Error while loading job data from archive")
return err, 0, 0
}
//Deep copy the cached archive hashmap
jd = metricdata.DeepCopy(jd_temp)
//Resampling for archived data.
//Pass the resolution from frontend here.
for _, v := range jd {
for _, v_ := range v {
timestep := 0
for i := 0; i < len(v_.Series); i += 1 {
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, v_.Timestep, resolution)
if err != nil {
return err, 0, 0
}
}
v_.Timestep = timestep
}
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// FIXME: Review: Is this really necessary or correct.
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
// to be available at the scope 'node'. If a job has a lot of nodes,
// statisticsSeries should be available so that a min/median/max Graph can be
// used instead of a lot of single lines.
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
// Existing (archived) StatsSeries can be 'min/mean/max'!
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
log.Error("Error in returned dataset")
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
if err != nil {
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for polar plots in frontend
func LoadStatData(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadStatsFromArchive(job, metrics)
}
data := make(map[string]schema.MetricStatistics, len(metrics))
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return data, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return data, err
}
for _, m := range metrics {
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
nodes, ok := stats[m]
if !ok {
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
continue
}
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
log.Warnf("partial error: %s", err.Error())
} else {
log.Error("Error while loading node data from metric repository")
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, totalNodes, hasNextPage, err := repo.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, resolution, from, to, page, ctx)
if err != nil {
if len(data) != 0 {
log.Warnf("partial error: %s", err.Error())
} else {
log.Error("Error while loading node data from metric repository")
return nil, totalNodes, hasNextPage, err
}
}
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, totalNodes, hasNextPage, nil
}

View File

@@ -1,333 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
)
type InfluxDBv2DataRepositoryConfig struct {
Url string `json:"url"`
Token string `json:"token"`
Bucket string `json:"bucket"`
Org string `json:"org"`
SkipTls bool `json:"skiptls"`
}
type InfluxDBv2DataRepository struct {
client influxdb2.Client
queryClient influxdb2Api.QueryAPI
bucket, measurement string
}
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
var config InfluxDBv2DataRepositoryConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warn("Error while unmarshaling raw json config")
return err
}
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
idb.queryClient = idb.client.QueryAPI(config.Org)
idb.bucket = config.Bucket
return nil
}
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
}
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
return time.Unix(epoch, 0)
}
func (idb *InfluxDBv2DataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error) {
measurementsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
// Requested Scopes
for _, scope := range scopes {
query := ""
switch scope {
case "node":
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
// log.Info("Scope 'node' requested. ")
query = fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
measurementsCond, hostsCond)
case "socket":
log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "core":
log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
continue
// Get Finest Granularity only, Set NULL to 0.0
// query = fmt.Sprintf(`
// from(bucket: "%s")
// |> range(start: %s, stop: %s)
// |> filter(fn: (r) => %s )
// |> filter(fn: (r) => %s )
// |> drop(columns: ["_start", "_stop", "cluster"])
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
// idb.bucket,
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
// measurementsCond, hostsCond)
default:
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
continue
// return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'")
}
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
for _, metric := range metrics {
jobMetric, ok := jobData[metric]
if !ok {
mc := archive.GetMetricConfig(job.Cluster, metric)
jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above!
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
},
}
}
jobData[metric] = jobMetric
}
// Process Result: Time-Data
field, host, hostSeries := "", "", schema.Series{}
// typeId := 0
switch scope {
case "node":
for rows.Next() {
row := rows.Record()
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
if host != "" {
// Append Series before reset
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{
Hostname: host,
Statistics: schema.MetricStatistics{}, //TODO Add Statistics
Data: make([]schema.Float, 0),
}
}
val, ok := row.Value().(float64)
if ok {
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
} else {
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
}
}
case "socket":
continue
case "core":
continue
// Include Series.Id in hostSeries
// for rows.Next() {
// row := rows.Record()
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
// if ( host != "" ) {
// // Append Series before reset
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
// }
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
// hostSeries = schema.Series{
// Hostname: host,
// Id: &typeId,
// Statistics: nil,
// Data: make([]schema.Float, 0),
// }
// }
// val := row.Value().(float64)
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
// }
default:
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
}
// Append last Series
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
// Get Stats
stats, err := idb.LoadStats(job, metrics, ctx)
if err != nil {
log.Warn("Error while loading statistics")
return nil, err
}
for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats {
for node, stats := range nodes {
for index, _ := range jobData[metric][scope].Series {
if jobData[metric][scope].Series[index].Hostname == node {
jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
}
}
}
}
}
}
return jobData, nil
}
func (idb *InfluxDBv2DataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
// lenMet := len(metrics)
for _, metric := range metrics {
// log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet)
query := fmt.Sprintf(`
data = from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
data |> min(column: "_value") |> set(key: "_field", value: "min"),
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|> group()`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
metric, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
nodes := map[string]schema.MetricStatistics{}
for rows.Next() {
row := rows.Record()
host := row.ValueByKey("hostname").(string)
avg, avgok := row.ValueByKey("avg").(float64)
if !avgok {
// log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg)
avg = 0.0
}
min, minok := row.ValueByKey("min").(float64)
if !minok {
// log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min)
min = 0.0
}
max, maxok := row.ValueByKey("max").(float64)
if !maxok {
// log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max)
max = 0.0
}
nodes[host] = schema.MetricStatistics{
Avg: avg,
Min: min,
Max: max,
}
}
stats[metric] = nodes
}
return stats, nil
}
func (idb *InfluxDBv2DataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
// TODO : Implement to be used in Analysis- und System/Node-View
log.Infof("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes)
return nil, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
}
func (idb *InfluxDBv2DataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
var totalNodes int = 0
var hasNextPage bool = false
// TODO : Implement to be used in NodeList-View
log.Infof("LoadNodeListData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodeFilter %v, scopes %v", cluster, metrics, nodeFilter, scopes)
return nil, totalNodes, hasNextPage, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
}

View File

@@ -1,83 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
func Init() error {
for _, cluster := range config.Keys.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
log.Warn("Error while unmarshaling raw json MetricDataRepository")
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "influxdb":
mdr = &InfluxDBv2DataRepository{}
case "prometheus":
mdr = &PrometheusDataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
}
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
log.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
return err
}
metricDataRepos[cluster.Name] = mdr
}
}
return nil
}
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
var err error
repo, ok := metricDataRepos[cluster]
if !ok {
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
return repo, err
}

View File

@@ -1,467 +0,0 @@
// Copyright (C) 2022 DKRZ
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
promcfg "github.com/prometheus/common/config"
promm "github.com/prometheus/common/model"
)
type PrometheusDataRepositoryConfig struct {
Url string `json:"url"`
Username string `json:"username,omitempty"`
Suffix string `json:"suffix,omitempty"`
Templates map[string]string `json:"query-templates"`
}
type PrometheusDataRepository struct {
client promapi.Client
queryClient promv1.API
suffix string
templates map[string]*template.Template
}
type PromQLArgs struct {
Nodes string
}
type Trie map[rune]Trie
var logOnce sync.Once
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
if len(data) == 0 {
return 0.0, 0.0, 0.0
}
min := math.MaxFloat64
max := -math.MaxFloat64
var sum float64
var n float64
for _, val := range data {
if val.IsNaN() {
continue
}
sum += float64(val)
n += 1
if float64(val) > max {
max = float64(val)
}
if float64(val) < min {
min = float64(val)
}
}
return min, max, sum / n
}
// Rewritten from
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
func nodeRegex(nodes []string) string {
root := Trie{}
// add runes of each compute node to trie
for _, node := range nodes {
_trie := root
for _, c := range node {
if _, ok := _trie[c]; !ok {
_trie[c] = Trie{}
}
_trie = _trie[c]
}
_trie['*'] = Trie{}
}
// recursively build regex from rune trie
var trieRegex func(trie Trie, reset bool) string
trieRegex = func(trie Trie, reset bool) string {
if reset == true {
trie = root
}
if len(trie) == 0 {
return ""
}
if len(trie) == 1 {
for key, _trie := range trie {
if key == '*' {
return ""
}
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
}
} else {
sequences := []string{}
for key, _trie := range trie {
if key != '*' {
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
}
}
sort.Slice(sequences, func(i, j int) bool {
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
})
var result string
// single edge from this tree node
if len(sequences) == 1 {
result = sequences[0]
if len(result) > 1 {
result = "(?:" + result + ")"
}
// multiple edges, each length 1
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
// char or numeric range
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
// char or numeric set
} else {
result = "[" + s + "]"
}
// multiple edges of different lengths
} else {
result = "(?:" + strings.Join(sequences, "|") + ")"
}
if _, ok := trie['*']; ok {
result += "?"
}
return result
}
return ""
}
return trieRegex(root, true)
}
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var config PrometheusDataRepositoryConfig
// parse config
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warn("Error while unmarshaling raw json config")
return err
}
// support basic authentication
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
client, err := promapi.NewClient(promapi.Config{
Address: config.Url,
RoundTripper: rt,
})
if err != nil {
log.Error("Error while initializing new prometheus client")
return err
}
// init query client
pdb.client = client
pdb.queryClient = promv1.NewAPI(pdb.client)
// site config
pdb.suffix = config.Suffix
// init query templates
pdb.templates = make(map[string]*template.Template)
for metric, templ := range config.Templates {
pdb.templates[metric], err = template.New(metric).Parse(templ)
if err == nil {
log.Debugf("Added PromQL template for %s: %s", metric, templ)
} else {
log.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
}
}
return nil
}
// TODO: respect scope argument
func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
} else {
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
}
buf := &bytes.Buffer{}
if templ, ok := pdb.templates[metric]; ok {
err := templ.Execute(buf, args)
if err != nil {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
} else {
query := buf.String()
log.Debugf("PromQL: %s", query)
return query, nil
}
} else {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
}
}
// Convert PromAPI row to CC schema.Series
func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
for _, v := range row.Values {
idx := (v.Timestamp.Unix() - ts) / step
values[idx] = schema.Float(v.Value)
}
min, max, mean := MinMaxMean(values)
// output struct
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
},
}
}
func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
jobData := make(schema.JobData)
// parse job specs
nodes := make([]string, len(job.Resources))
for i, resource := range job.Resources {
nodes[i] = resource.Hostname
}
from := job.StartTime
to := job.StartTime.Add(time.Duration(job.Duration) * time.Second)
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
if metricConfig == nil {
log.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
if err != nil {
log.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all job nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
log.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
log.Warnf("Warnings: %v\n", warnings)
}
// init data structures
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
jobMetric.Series = append(jobMetric.Series,
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
sort.Slice(jobMetric.Series, func(i, j int) bool {
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
})
}
}
return jobData, nil
}
// TODO change implementation to precomputed/cached stats
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
log.Warn("Error while loading job for stats")
return nil, err
}
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = series.Statistics
}
}
return stats, nil
}
func (pdb *PrometheusDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
log.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
log.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
log.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[hostname] = hostdata
}
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},
)
}
}
}
t1 := time.Since(t0)
log.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
var totalNodes int = 0
var hasNextPage bool = false
// TODO : Implement to be used in NodeList-View
log.Infof("LoadNodeListData unimplemented for PrometheusDataRepository, Args: cluster %s, metrics %v, nodeFilter %v, scopes %v", cluster, metrics, nodeFilter, scopes)
return nil, totalNodes, hasNextPage, errors.New("METRICDATA/INFLUXV2 > unimplemented for PrometheusDataRepository")
}

View File

@@ -1,111 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
// Only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
panic("TODO")
}
func DeepCopy(jd_temp schema.JobData) schema.JobData {
var jd schema.JobData
jd = make(schema.JobData, len(jd_temp))
for k, v := range jd_temp {
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jd_temp[k]))
for k_, v_ := range v {
jd[k][k_] = new(schema.JobMetric)
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
for i := 0; i < len(v_.Series); i += 1 {
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
jd[k][k_].Series[i].Id = v_.Series[i].Id
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
}
jd[k][k_].Timestep = v_.Timestep
jd[k][k_].Unit.Base = v_.Unit.Base
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
if v_.StatisticsSeries != nil {
// Init Slices
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
// Copy Data
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
// Handle Percentiles
for k__, v__ := range v_.StatisticsSeries.Percentiles {
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
}
} else {
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
}
}
}
return jd
}

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
const configSchema = `{
"type": "array",
"description": "Array of metric store configurations with scope-based routing.",
"items": {
"type": "object",
"properties": {
"scope": {
"description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)",
"type": "string"
},
"url": {
"description": "URL of the metric store endpoint",
"type": "string"
},
"token": {
"description": "Authentication token for the metric store",
"type": "string"
}
},
"required": ["scope", "url", "token"]
}
}`

View File

@@ -0,0 +1,533 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricdispatch provides a unified interface for loading and caching job metric data.
//
// This package serves as a central dispatcher that routes metric data requests to the appropriate
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
// For completed jobs, data is retrieved from the file-based job archive.
//
// # Key Features
//
// - Automatic backend selection based on job state (running vs. archived)
// - LRU cache for performance optimization (128 MB default cache size)
// - Data resampling using Largest Triangle Three Bucket algorithm for archived data
// - Automatic statistics series generation for jobs with many nodes
// - Support for scoped metrics (node, socket, accelerator, core)
//
// # Cache Behavior
//
// Cached data has different TTL (time-to-live) values depending on job state:
// - Running jobs: 2 minutes (data changes frequently)
// - Completed jobs: 5 hours (data is static)
//
// The cache key is based on job ID, state, requested metrics, scopes, and resolution.
//
// # Usage
//
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
//
// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// // Handle error
// }
//
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
package metricdispatch
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
// The cache reduces load on both the metric store and archive backends.
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// cacheKey generates a unique cache key for a job's metric data based on job ID, state,
// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded
// because job.ID is more unique and the cache TTL ensures entries don't persist indefinitely.
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
*job.ID, job.State, metrics, scopes, resolution)
}
// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs,
// archive for completed jobs) and applies caching, resampling, and statistics generation as needed.
//
// For running jobs or when archive is disabled, data is fetched from the metric store.
// For completed archived jobs, data is loaded from the job archive and resampled if needed.
//
// Parameters:
// - job: The job for which to load metric data
// - metrics: List of metric names to load (nil loads all metrics for the cluster)
// - scopes: Metric scopes to include (nil defaults to node scope)
// - ctx: Context for cancellation and timeouts
// - resolution: Target number of data points for resampling (only applies to archived data)
//
// Returns the loaded job data and any error encountered. For partial errors (some metrics failed),
// the function returns the successfully loaded data with a warning logged.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving {
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else {
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
}
size = jd.Size()
} else {
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
jd = deepCopy(jdTemp)
// Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points
// to the requested resolution, improving transfer performance and client-side rendering.
for _, v := range jd {
for _, v_ := range v {
timestep := int64(0)
for i := 0; i < len(v_.Series); i += 1 {
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
if err != nil {
return err, 0, 0
}
}
v_.Timestep = int(timestep)
}
}
// Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer.
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// Generate statistics series for jobs with many nodes to enable min/median/max graphs
// instead of overwhelming the UI with individual node lines. Note that newly calculated
// statistics use min/median/max, while archived statistics may use min/mean/max.
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error())
return nil, err
}
return data.(schema.JobData), nil
}
// LoadAverages computes average values for the specified metrics across all nodes of a job.
// For running jobs, it loads statistics from the metric store. For completed jobs, it uses
// the pre-calculated averages from the job archive. The results are appended to the data slice.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err
}
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator).
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err
}
// Round Resulting Stat Values
scopedStats.RoundScopedMetricStats()
return scopedStats, nil
}
// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes.
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning {
return archive.LoadStatsFromArchive(job, metrics)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err
}
for _, m := range metrics {
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
nodes, ok := stats[m]
if !ok {
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
continue
}
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range.
// This is used for node monitoring views and system status pages. Data is always fetched from
// the metric store (not the archive) since it's for current/recent node status monitoring.
//
// Returns a nested map structure: node -> metric -> scoped data.
// FIXME: Add support for subcluster specific cc-metric-stores
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, "")
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
cluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else {
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster)
}
return data, nil
}
// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range,
// with optional resampling and automatic statistics generation for large datasets.
// This is used for comparing multiple nodes or displaying node status over time.
//
// Returns a map of node names to their job-like metric data structures.
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
cluster, subCluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
} else {
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
return nil, err
}
}
// Generate statistics series for datasets with many series to improve visualization performance.
// Statistics are calculated as min/median/max.
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster)
}
return data, nil
}
// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying
// archived data (e.g., during resampling). This ensures the cached archive data remains
// immutable while allowing per-request transformations.
func deepCopy(source schema.JobData) schema.JobData {
result := make(schema.JobData, len(source))
for metricName, scopeMap := range source {
result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap))
for scope, jobMetric := range scopeMap {
result[metricName][scope] = copyJobMetric(jobMetric)
}
}
return result
}
func copyJobMetric(src *schema.JobMetric) *schema.JobMetric {
dst := &schema.JobMetric{
Timestep: src.Timestep,
Unit: src.Unit,
Series: make([]schema.Series, len(src.Series)),
}
for i := range src.Series {
dst.Series[i] = copySeries(&src.Series[i])
}
if src.StatisticsSeries != nil {
dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries)
}
return dst
}
func copySeries(src *schema.Series) schema.Series {
dst := schema.Series{
Hostname: src.Hostname,
ID: src.ID,
Statistics: src.Statistics,
Data: make([]schema.Float, len(src.Data)),
}
copy(dst.Data, src.Data)
return dst
}
func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries {
dst := &schema.StatsSeries{
Min: make([]schema.Float, len(src.Min)),
Mean: make([]schema.Float, len(src.Mean)),
Median: make([]schema.Float, len(src.Median)),
Max: make([]schema.Float, len(src.Max)),
}
copy(dst.Min, src.Min)
copy(dst.Mean, src.Mean)
copy(dst.Median, src.Median)
copy(dst.Max, src.Max)
if len(src.Percentiles) > 0 {
dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles))
for percentile, values := range src.Percentiles {
dst.Percentiles[percentile] = make([]schema.Float, len(values))
copy(dst.Percentiles[percentile], values)
}
}
return dst
}

Some files were not shown because too many files have changed in this diff Show More