mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-17 21:37:30 +01:00
Compare commits
1361 Commits
v1.3.1
...
optimize-d
| Author | SHA1 | Date | |
|---|---|---|---|
|
dc8e8b9095
|
|||
|
2080e268ce
|
|||
|
e8204ff1a2
|
|||
|
09fa239e8b
|
|||
|
301e590580
|
|||
|
eba3995610
|
|||
|
f8831e7040
|
|||
|
1cf99206a9
|
|||
|
5d3d77620e
|
|||
|
|
5c72664162 | ||
|
|
f3e796f3f5 | ||
|
|
cc38b17472 | ||
| 282197ebef | |||
|
|
d2bc046fc6 | ||
|
|
d0ebba5b4a | ||
| 70fea39d03 | |||
|
|
88bd83b07e | ||
|
|
d74465215d | ||
|
|
0fb9dc0373 | ||
|
|
2c519ab2dc | ||
|
|
0b89d6a084 | ||
| ddda341e10 | |||
|
|
8ba1eb533a | ||
| 47181330e9 | |||
| 845d0111af | |||
| 26982088c3 | |||
|
|
9672903d41 | ||
| 67a17b5306 | |||
| 75591eb034 | |||
| 33ec755422 | |||
| 2b74ac822f | |||
| cc0403e2a4 | |||
|
|
db625239ea | ||
|
|
a72581727d | ||
| 87425c0b09 | |||
| 39635ea123 | |||
|
|
1d3e7e5b2f | ||
|
|
84fda9c8e2 | ||
|
|
f11ff3302d | ||
|
|
4e1b00a032 | ||
|
6da41982ce
|
|||
|
763e0c8d7c
|
|||
|
b13e1628a0
|
|||
|
5669eb5818
|
|||
|
|
74ab51f409 | ||
|
|
8b0f63b1f8 | ||
| 688ad507a2 | |||
| 15be664ad8 | |||
| 22c442db5b | |||
|
|
38bb2dd4ec | ||
|
|
32fd18543a | ||
|
|
718ff60221 | ||
|
|
a243e17499 | ||
| 3d5a124321 | |||
| 1ec41d8389 | |||
| 888d7fb235 | |||
| adebffd251 | |||
|
|
d00aa2666d | ||
|
|
2e5d85c223 | ||
|
|
07b989cb81 | ||
| a418abc7d5 | |||
| a1db8263d7 | |||
| 4c3cd8e66a | |||
| 6ecb934967 | |||
|
|
348b6010e8 | ||
| ca0f9a42c7 | |||
|
|
0a0db36433 | ||
| 92236b1d1d | |||
|
|
df3bc111a4 | ||
| cc21e0e62c | |||
|
|
a6e23dd52e | ||
| 45a1bc78b7 | |||
|
|
8bacffbd3e | ||
|
|
ff180affd7 | ||
|
|
248b923980 | ||
| 5d136634a2 | |||
|
|
ac5ee1564a | ||
| 998aff2345 | |||
|
|
0dea959391 | ||
| dadcb983e7 | |||
| 8989b7a410 | |||
|
|
2b56e02a3e | ||
|
|
bcfe9022de | ||
|
|
31f3c28294 | ||
|
|
12c01655c3 | ||
| ab55ce91a1 | |||
| 03c65e06f6 | |||
| defa8fa994 | |||
| c9d8de0d56 | |||
|
|
2b788f14ec | ||
| 86fbecc679 | |||
| 8ee6c09e9b | |||
| fc1ba1f5b3 | |||
| 82e79b074a | |||
|
|
bae7ec11b4 | ||
|
|
2d90fd05d6 | ||
|
e1c1148160
|
|||
|
|
5ee3bbdbf5 | ||
|
dc161ec421
|
|||
|
abd11d783b
|
|||
|
|
39c919bb0c | ||
|
064aa0a238
|
|||
|
f00f9fcee0
|
|||
|
|
705d70ddc0 | ||
|
|
7789489d08 | ||
|
|
62cd21eb83 | ||
|
90b52f997d
|
|||
|
745c0357f3
|
|||
|
|
087c00eb7f | ||
|
|
facfc9d4c9 | ||
|
|
89e06c3530 | ||
|
57536d982c
|
|||
|
|
0c0f423b84 | ||
|
7bd79dcc3c
|
|||
|
2a659915a4
|
|||
|
|
c77e2969c5 | ||
|
d8ad6dd3f0
|
|||
|
cd8b574cca
|
|||
|
2da35909c1
|
|||
|
2e24fde430
|
|||
|
757be60b22
|
|||
|
589149790f
|
|||
|
29c440a637
|
|||
|
6035b62734
|
|||
|
|
fe3ebe0abc | ||
|
|
a52d7d017f | ||
|
9af44779aa
|
|||
|
|
1cf2c41bd7 | ||
|
|
2eeefc2720 | ||
|
|
0dd894890f | ||
| 25ff094bdf | |||
| d2ff4a2e02 | |||
| 4b983e3b9b | |||
| 98f9c18f72 | |||
| 7d8b305cd9 | |||
| 6e3462f962 | |||
| 2c8608f5a4 | |||
|
|
3215bc3de0 | ||
| 140c7f6e47 | |||
| c15f1117f5 | |||
|
|
48729b172d | ||
|
|
76ce8122e2 | ||
| f016bd4232 | |||
| 54ea5d7900 | |||
| 865cd3db54 | |||
| 90c8fbf07c | |||
| f4ee0d1042 | |||
|
|
e75da7f8cc | ||
|
|
12e9f6700e | ||
|
bca7dd743b
|
|||
|
8d6c6b819b
|
|||
|
|
d07a537f0b | ||
|
|
5e4994a64c | ||
|
|
a5a1fd1a6a | ||
|
|
49a1748641 | ||
|
|
a2f0b57ab9 | ||
| 0dff9fa07f | |||
| 1feb3baf68 | |||
| d21943a514 | |||
| 035ac2384e | |||
| ac7eb93141 | |||
|
|
51e9d33f9f | ||
|
|
d1e7ea09bc | ||
|
|
7dd3ee3084 | ||
| 1980ef5f43 | |||
| fd9b76c6a7 | |||
| abdd7ea6f1 | |||
| c7b366f35f | |||
|
|
84e715a273 | ||
| 396a628175 | |||
| 624746f34b | |||
| 2b395a94e6 | |||
| f6aa40d927 | |||
| c920c57f5d | |||
| 363e839c49 | |||
|
|
e681e9e7ec | ||
| a8194de492 | |||
|
|
1145b31a49 | ||
|
|
b7df1f56ef | ||
|
|
c43d4a0f16 | ||
| a8d385a1ee | |||
| 5579b6f40c | |||
| 7123a8c1cc | |||
|
|
9cc09145ec | ||
| 6294f8e263 | |||
| 0adf2bad92 | |||
| a85f72fccd | |||
| db8772dc0b | |||
| fa7727c6ca | |||
|
|
5655639320 | ||
|
|
720f40c9c9 | ||
| f671d8df90 | |||
| 265da42385 | |||
| b160284a1b | |||
|
|
fcb37b0367 | ||
| 0984c1d431 | |||
|
|
276559d120 | ||
|
|
e3148b16eb | ||
|
|
4d13c37008 | ||
|
|
84d7a7aa7d | ||
|
|
5616801f3e | ||
|
|
b5f6ee9c0c | ||
|
|
af73ce9c6d | ||
|
|
a7a95bb866 | ||
|
5d7dd62b72
|
|||
|
46fb52d67e
|
|||
|
|
39b8356683 | ||
|
42ce598865
|
|||
|
0d62a300e7
|
|||
|
|
3cf88f757c | ||
|
75a74c162d
|
|||
|
248f11f4f8
|
|||
|
|
75e849922d | ||
|
|
d39b955b25 | ||
|
00a41373e8
|
|||
|
|
e9cd6b4225 | ||
|
|
13cca1ee62 | ||
|
|
7b4e2fcf59 | ||
|
|
f2285e603b | ||
|
|
b7bd8210e5 | ||
|
|
1791e665aa | ||
|
|
a71341064e | ||
|
|
74dbbaa794 | ||
|
|
732fab4a04 | ||
|
|
aa3fcbfe17 | ||
| 5e58c9f376 | |||
| b600eeca5e | |||
|
|
e6662c4592 | ||
|
|
1ffcc5e241 | ||
|
|
32f0664012 | ||
|
|
2ef1826b12 | ||
|
|
d397457ce6 | ||
|
|
e8c81ba7d4 | ||
|
|
318dbd65e0 | ||
|
|
dd56e75b50 | ||
|
|
df93786474 | ||
|
|
4deec9a170 | ||
|
|
f26cabbdf1 | ||
|
|
195a1edcfe | ||
|
|
7101d2bb3b | ||
|
|
3452891613 | ||
|
|
b25abc5f16 | ||
|
|
60a847922e | ||
| 0d857b49a2 | |||
| eb5aa9ad02 | |||
| 98661aad15 | |||
| 69739ffdfd | |||
| 95689e3c99 | |||
|
|
9d9babe94d | ||
| 9d15a87c88 | |||
|
|
719aaaff4b | ||
| bbde91a1f9 | |||
| 55cb2cb6d6 | |||
| 752e19c276 | |||
|
|
dac382af53 | ||
| 28a3ff8d67 | |||
|
|
ae81687da9 | ||
|
|
2173d3527d | ||
| 2859f12dc1 | |||
| b307e885ce | |||
| 4853814228 | |||
| ca08717b9d | |||
|
|
934bc13c2c | ||
|
|
4aa337ccc8 | ||
|
|
700f2aad55 | ||
|
|
836e6e4242 | ||
| b5182c4c13 | |||
|
|
808e281ee8 | ||
| b9e65b50db | |||
| 0ea836c69c | |||
|
|
e074bb315c | ||
|
|
084d00cb0d | ||
| 7ecfc8468e | |||
| c782043c64 | |||
|
|
fbf4004e92 | ||
|
|
a2c1b65f91 | ||
|
|
0af550bf4e | ||
|
|
436194e46d | ||
|
|
49938bcef8 | ||
|
|
da2a78faa3 | ||
|
|
98dc8cf5b5 | ||
|
|
cd810b45ec | ||
|
|
22b1d4d276 | ||
|
|
25c5457ef3 | ||
|
|
ea6b9d910b | ||
|
|
5567371ccd | ||
|
|
585c4fcace | ||
| 525d99140f | |||
| 499b4287f9 | |||
|
|
b7df4f7cca | ||
| f41301036b | |||
| 30516776e5 | |||
| 07afcc4cd4 | |||
|
|
05abea87e7 | ||
|
|
4459840f5f | ||
|
|
55e0456aac | ||
|
|
f18ae35030 | ||
|
|
f416be77f7 | ||
| 1d4c79c821 | |||
| d4edbd7d1a | |||
| 5281f3bb60 | |||
|
|
e91fbf405f | ||
|
|
1d41ff8190 | ||
|
|
77a2a256e4 | ||
|
|
eb09504306 | ||
|
|
b912be5978 | ||
|
|
1a41629535 | ||
|
|
b81d9b05ac | ||
|
|
1d62ee1e22 | ||
|
|
55d2c7d7eb | ||
|
|
bb527fb410 | ||
| 9a97d0e8eb | |||
| 93dcfee8c5 | |||
| 76139ef53c | |||
|
|
32319adf72 | ||
|
|
10a5c89a16 | ||
|
|
40bff1eff9 | ||
|
|
ceba4eb0c6 | ||
|
|
faacf3f343 | ||
|
|
7cd98c4f25 | ||
|
|
489ad44b9f | ||
|
|
02a8cf05d1 | ||
| 7db2bbe6b0 | |||
| b6f0faa97f | |||
| a3fffa8e8b | |||
| 72248defbf | |||
| 155e05495e | |||
| 9c92a7796b | |||
| 7c78407c49 | |||
| cb219b3c74 | |||
| d59aa2e855 | |||
|
|
cd3d133f0d | ||
|
|
3b7fc44ce9 | ||
| e1efc68476 | |||
| 8f0bb907ff | |||
|
|
e5c620ca20 | ||
|
|
d0bcfb90e6 | ||
|
|
9deee54e41 | ||
|
|
94b86ef11a | ||
|
|
d8cd752dcb | ||
|
|
5d376e6865 | ||
| 9c3beddf54 | |||
| c6465ad9e5 | |||
| d415381d4a | |||
| 211d4fae54 | |||
|
|
3276ed7785 | ||
|
|
77b7548ef3 | ||
|
|
59851f410e | ||
|
|
4cb8d648cb | ||
| c8627a13f4 | |||
| 0ea0270fe1 | |||
| 19402d30af | |||
| b2f870e3c0 | |||
| 9e542dc200 | |||
| 6cf59043a3 | |||
| 71b75eea0e | |||
|
e900a686db
|
|||
|
fb8db3c3ae
|
|||
|
|
170a9ace8a | ||
|
|
518e9950ea | ||
|
25c8fca561
|
|||
| 754f7e16f6 | |||
| 04a2e460ae | |||
| 2ebab1e2e2 | |||
| a9366d14c6 | |||
| 42809e3f75 | |||
| 4cec933349 | |||
| d3f3c532b1 | |||
| ad1e87d0b8 | |||
|
|
affa85c086 | ||
|
|
aa053d78f7 | ||
|
|
fae6d9d835 | ||
|
|
78f1db7ad1 | ||
| f1367f84f8 | |||
|
|
4c81696f4d | ||
|
|
a91f8f72e3 | ||
|
|
87f7ed329c | ||
|
|
8641d9053d | ||
|
|
4a5ab8a279 | ||
|
|
d179412ab6 | ||
|
|
968c7d179d | ||
| 56399523d7 | |||
| 4d6326b8be | |||
|
|
a2414791bf | ||
|
|
faf3a19f0c | ||
|
|
4e6038d6c1 | ||
|
|
ddc2ecf829 | ||
| ecb5aef735 | |||
| 11ec2267da | |||
| 8576ae458d | |||
|
|
c66445acb5 | ||
|
|
29a20f7b0b | ||
|
|
874c019fb6 | ||
|
|
54825626de | ||
| 9bf5c5dc1a | |||
| 64fef9774c | |||
| 999667ec0c | |||
| c1135531ba | |||
| 287256e5f1 | |||
| 0bc26aa194 | |||
|
|
502d7e9084 | ||
|
|
89875db4a9 | ||
|
|
5a8b929448 | ||
|
|
fe78f2f433 | ||
|
|
e37591ce6d | ||
| 1cd4a57bd3 | |||
| b35172e2f7 | |||
| 3cfcd30128 | |||
| e56532e5c8 | |||
| fdee4f8938 | |||
|
|
7acc89e42d | ||
|
|
af7d208c21 | ||
|
|
91b90d033e | ||
|
|
7a0975b94d | ||
|
|
c58b01a602 | ||
|
|
8244449646 | ||
|
|
436afa4a61 | ||
|
|
998f800632 | ||
| 06ed056d43 | |||
| d446c13546 | |||
| 6e74fa294a | |||
|
|
43bdb56072 | ||
|
|
10a0b0add8 | ||
| e707fd0893 | |||
|
|
19c8e9beb1 | ||
|
|
32e5353847 | ||
|
|
d2f2d78954 | ||
| b8fdfc30c0 | |||
| 79a2ca8ae8 | |||
| d1a78c13a4 | |||
| f4b00e9de1 | |||
| 0a5e155096 | |||
| 4ecc050c4c | |||
| 88dc5036b3 | |||
| d30c6ef3bf | |||
|
0419fec810
|
|||
|
43e5fd1131
|
|||
|
|
11e94124cc | ||
|
|
102109388b | ||
|
|
60a69aa0a2 | ||
| 5e2cbd75fa | |||
| 14f1192ccb | |||
| 72b2560ecf | |||
| 7fce6fa401 | |||
| e6286768a7 | |||
| 0306723307 | |||
| 6f49998ad3 | |||
| 457c944ec6 | |||
| 33c38f9464 | |||
| 46351389b6 | |||
|
|
d56b0e93db | ||
|
|
f9aa47ea1c | ||
| d567a5312e | |||
| 97a322354f | |||
| 554527445b | |||
|
|
c5aff1a2ca | ||
| 987cc40318 | |||
| 104fd1576a | |||
| 72ce3954b4 | |||
| cfa7461855 | |||
| 44cda8a232 | |||
| cf119e6843 | |||
|
|
451744f321 | ||
|
|
ca6682b94b | ||
|
|
cbad2341c3 | ||
|
|
a956c7b135 | ||
|
|
ea6caeb2f0 | ||
|
|
c17e8b1156 | ||
|
|
b993b1e096 | ||
| d7d81e352d | |||
| 078c608bda | |||
| f2e57f9edd | |||
| 5698d5216f | |||
| 10aa2bfbd3 | |||
| 6cfed989ff | |||
| ab70acd582 | |||
|
|
79e1c236fe | ||
|
|
fed62b6c45 | ||
|
|
0d62181272 | ||
|
|
290a71bd48 | ||
|
|
6e385db378 | ||
|
|
ffe8329b84 | ||
| f13be109c2 | |||
| d24d85b970 | |||
| 8d44ac90ad | |||
|
|
4083de2a51 | ||
|
|
131df075db | ||
|
|
afd6f50ba2 | ||
|
|
ad01366705 | ||
| 6325793902 | |||
|
|
8ea176f9da | ||
| 03b5272e44 | |||
| 7da01975f7 | |||
| 7cff8bbfd2 | |||
|
|
c98cbb33f8 | ||
| f3ea95535b | |||
| b9b84b7971 | |||
| be7340ca30 | |||
| 881c4566dd | |||
| 7efbb0217f | |||
| 9e2ce39cde | |||
|
|
0ff6cae1c3 | ||
|
|
d02ba3d717 | ||
| 6aa830adb6 | |||
| be6603cbb9 | |||
|
|
8d208929d5 | ||
|
|
cb0f96b737 | ||
|
|
83723ab050 | ||
|
|
3abaefa550 | ||
|
|
389010dbbd | ||
| 81fe2c043e | |||
| c76e9bb3fe | |||
| 48b68d3410 | |||
| 2b64b31393 | |||
| 2333068de7 | |||
| 78530029ef | |||
| 329b6e5640 | |||
|
|
967f0a3294 | ||
|
|
6eb779d359 | ||
|
|
414147177a | ||
|
|
3b37f3630c | ||
|
|
7c1a818582 | ||
|
|
c4cf7e9707 | ||
|
|
1ceb681521 | ||
|
|
443176a0d1 | ||
|
|
261905a364 | ||
| e00288b160 | |||
| f141ca926f | |||
| f7a0954213 | |||
|
|
da8d562eba | ||
|
|
399af8592c | ||
| 6239e7f19b | |||
| d0e1b7186c | |||
| fea3292f50 | |||
| 9973aa9ffa | |||
| 0b38a980d2 | |||
| 20838b6882 | |||
| 8f4ef1e274 | |||
| e1c7583670 | |||
| 39a2157d46 | |||
| dd63e7157a | |||
| 340efd7926 | |||
| ecc6194b57 | |||
|
|
90c3381954 | ||
|
|
21334c8026 | ||
|
|
cbdef6ce9e | ||
|
|
591cd9fd66 | ||
|
|
e8d2a45afb | ||
|
|
3b533938a6 | ||
|
|
9fe342a7e9 | ||
|
|
2152ced97a | ||
|
|
404be5f317 | ||
|
|
f56783a439 | ||
|
|
fb278182d3 | ||
|
|
c2c63d2f67 | ||
|
|
7f740455fe | ||
|
|
946b992746 | ||
|
|
a6c43e6f2f | ||
|
|
ecad52c18d | ||
|
|
e49e5a0474 | ||
|
|
9231b3cfca | ||
|
|
68e0159292 | ||
|
|
1a674590bf | ||
|
|
1ef47e7b3f | ||
|
|
214a2762df | ||
| cb5d06decd | |||
| 8555a88202 | |||
|
|
2287f4493a | ||
|
|
bb357f7cab | ||
|
|
d9b240cd2d | ||
|
|
bea5ee96d9 | ||
|
|
7d205fd526 | ||
|
|
c15b2a0cbb | ||
|
|
7ccba30a3d | ||
|
|
8091485588 | ||
|
|
1413f968d6 | ||
|
|
d1d1bb09e9 | ||
|
|
3c1a7e0171 | ||
|
|
0cb50f2f01 | ||
|
|
2287586700 | ||
|
|
ea7660ddb3 | ||
|
|
44e98e8f2f | ||
|
|
856ccbb969 | ||
|
|
0920286b4c | ||
|
|
f34e10cfd9 | ||
| ae5d202661 | |||
| bc43c844fc | |||
| 67be9aa27b | |||
| 047b997a22 | |||
| bac51891b7 | |||
|
|
714d6af7cd | ||
| 6efd6334bb | |||
| 91f4475d76 | |||
|
|
de309784b4 | ||
|
|
a623cf53f3 | ||
| 440cd59e50 | |||
| eefb6f6265 | |||
| f5e1226837 | |||
| 151f7e701f | |||
| 40398497c2 | |||
|
|
cda10788fb | ||
|
|
845905d9c8 | ||
| 89055506d6 | |||
|
|
5908ae7905 | ||
|
|
4131665284 | ||
|
|
6a43dfb0d7 | ||
| 3d38d78845 | |||
| 600f19ac80 | |||
|
|
0a3a664653 | ||
|
|
471ec1cd2e | ||
|
|
e296cd7ca0 | ||
|
|
31cfa8cd7c | ||
|
|
70fe8aa367 | ||
|
|
cc9dafac6f | ||
|
|
32429f1481 | ||
| 9485a463b8 | |||
| 35c6ab4a08 | |||
| e58b0fa015 | |||
| beb92967e5 | |||
| 015583f1cd | |||
| d40c54b802 | |||
| 647665b6b9 | |||
| 4fc78bc382 | |||
| 50d000e7e2 | |||
|
|
ad500c4bef | ||
|
|
916077c6f8 | ||
|
|
935fb238a4 | ||
|
|
d03e5b4562 | ||
|
|
05c45c6468 | ||
| 9020613a8b | |||
| be92d5943d | |||
|
|
b2368a0751 | ||
| 7948d5f773 | |||
|
|
1a16851ad0 | ||
|
|
810c14a839 | ||
|
|
df0e8eb228 | ||
| 79605c8a9e | |||
|
|
9b644119ae | ||
|
|
ffa9919019 | ||
|
55ca892f90
|
|||
|
eaca187032
|
|||
|
|
3b9d05cc6d | ||
| d00881de2e | |||
| d8e85cf75d | |||
| 39f21763e4 | |||
|
|
af43901ca3 | ||
|
|
62565b9ae2 | ||
|
|
bca176170c | ||
|
|
2a91ca0cff | ||
|
|
19a75554b0 | ||
|
|
58ae476a3e | ||
|
|
44d8254a0b | ||
|
|
bd2cdfcef2 | ||
| a50b832c2a | |||
|
|
10194105e3 | ||
|
|
b474288df7 | ||
|
|
f338209f32 | ||
|
|
bef832e45b | ||
|
|
71cfb4db77 | ||
| 86453e7e11 | |||
|
|
98b9f8e62d | ||
| 44cd8d258d | |||
| 764b65d094 | |||
|
|
4d2c64b012 | ||
|
|
35c0b0be58 | ||
|
|
7a54e2cfb3 | ||
|
|
54283f6d3c | ||
|
|
697acd1d88 | ||
|
|
5cdb80b4d6 | ||
|
|
e48ff8be73 | ||
|
|
096217eea6 | ||
|
|
ed5290be86 | ||
|
|
b036c3903c | ||
|
|
57b43b7b60 | ||
| ab1ddb7bd1 | |||
| 881f2f32f4 | |||
| 0754ba5292 | |||
|
|
743a89c3a2 | ||
|
|
6692c3ab7c | ||
|
|
c16a5fdac4 | ||
|
|
60ec7e54f5 | ||
| dd48f5ab87 | |||
|
|
db674ec31d | ||
|
|
48150ffc8b | ||
|
|
1ad80efab6 | ||
|
|
aa8789f8f8 | ||
|
|
56e3f2da5c | ||
|
|
a4104822e2 | ||
|
|
c13f386e3b | ||
| 4bd73450b5 | |||
| 64da28e814 | |||
| 639e1b9c6d | |||
|
|
63e828d2df | ||
|
|
b8c30b5703 | ||
|
|
805ea91fc2 | ||
|
|
c4c422da57 | ||
| 544fb35121 | |||
| 43edccb284 | |||
| 7531ba4b5c | |||
| 983aa592d8 | |||
| 8378784231 | |||
| dca25cc601 | |||
|
|
c8fe81cd80 | ||
| c0a4724f57 | |||
| 484c52d813 | |||
|
|
47843b2087 | ||
|
|
c3a6126799 | ||
|
|
e94b250541 | ||
|
|
db5f6c7540 | ||
|
|
79a6c9e90d | ||
| e2e67e3977 | |||
| 6c06450701 | |||
|
|
d7379a1af2 | ||
|
|
d731611e0c | ||
|
|
dceb92ba8e | ||
|
|
1e039cb1bf | ||
|
6f3e1ffbe3
|
|||
|
|
6a6dca3fce | ||
|
|
d6d92071bf | ||
|
|
d40657dc64 | ||
|
|
6dde2a1e59 | ||
|
|
b7823cec16 | ||
|
|
eabd7b8d51 | ||
|
|
27ec445e54 | ||
|
|
ad108b285f | ||
|
|
f471214ef7 | ||
|
|
a0190f8f40 | ||
| 82af984023 | |||
| 0373010497 | |||
|
|
c22d869aa7 | ||
| 87c93e90cd | |||
| 3d6dca9386 | |||
|
|
f946e7e6ab | ||
|
|
d50dfa5867 | ||
| 249128e011 | |||
| ca16a80b1f | |||
|
|
e789e7ba9b | ||
|
|
5048f7be14 | ||
|
|
0e3603f596 | ||
| 9cd4b3c1cc | |||
| 1d9aa75960 | |||
|
|
0a24ef70e0 | ||
| 3b5d3d671e | |||
| 7db83d216e | |||
| d1a7002422 | |||
| 1d8e7e072f | |||
|
7466fe7a34
|
|||
|
|
24cf5047da | ||
|
|
1f103e5ef5 | ||
|
|
9e87974eb1 | ||
|
|
d806cf76c4 | ||
|
|
6e2703998d | ||
| 6f9737c2c2 | |||
|
|
5e696c10d5 | ||
|
|
927e25c72c | ||
| 8b1b99ba35 | |||
| 2c102cd1ff | |||
|
|
42c4926c47 | ||
|
|
703556d893 | ||
|
|
0b529a5c3c | ||
|
|
5186b3f61e | ||
| 4dc0da5099 | |||
| 1bad6ba065 | |||
| 3efee22536 | |||
| eef48ac3a3 | |||
| e35cfbc3dd | |||
| 4a5fd96b32 | |||
|
|
bdffe73f59 | ||
| cdfe722457 | |||
| 0aecea6de2 | |||
| 5a88c77171 | |||
| 8003217092 | |||
| 9b325041c1 | |||
| 1e7fbe5d56 | |||
| 0261c263f9 | |||
| 8d6ae85b0d | |||
| f14bdb3068 | |||
| 3c66840f95 | |||
| 733e3ea9d5 | |||
|
ca634bb707
|
|||
| 9abc206d1a | |||
| 85f17c0fd8 | |||
| 14bad81b9f | |||
|
|
ffd596e2c7 | ||
| 99f8187092 | |||
| f30b784f45 | |||
| f06b5f8fc0 | |||
| 2e781b900d | |||
| d76b1ae75d | |||
| 40110580e0 | |||
| eab7961a83 | |||
| 432e06e801 | |||
| fe1ff5c7a3 | |||
| 6e66b8e08b | |||
| 7abdd0545e | |||
|
|
3f1768e467 | ||
|
|
f464921ae3 | ||
|
|
7603ad3fb0 | ||
|
|
be7ccc78b8 | ||
|
|
b3135c982f | ||
|
13386175f5
|
|||
|
23e8f3dc2d
|
|||
|
|
b323ce2eef | ||
|
|
08e323ba51 | ||
|
|
9f50f36b1d | ||
|
|
4399c1d590 | ||
|
|
f7376f6dca | ||
|
|
518cb34340 | ||
|
|
f210a5f508 | ||
|
|
9ebc49dd1c | ||
|
|
c119eeb468 | ||
|
|
ab616f8f79 | ||
|
|
69286881e4 | ||
|
|
4419df8d1b | ||
|
|
aed2bd48fc | ||
|
|
d3d752f90c | ||
|
|
33ecfe88ef | ||
|
|
fd52fdd35b | ||
|
|
1d13d3dccf | ||
|
|
1c84bcae35 | ||
|
|
df497d5952 | ||
|
|
f65e122f8d | ||
| 161f0744aa | |||
| 95de9ad3b3 | |||
|
|
d5c170055f | ||
|
|
61f0521072 | ||
|
|
6ca14c55f2 | ||
|
|
1309d09aee | ||
| aba75b3a19 | |||
|
|
e87481d8db | ||
| acaad69917 | |||
|
|
ff588ad57a | ||
| 65df27154c | |||
| 8dfa1957f4 | |||
| 570eba3794 | |||
| 94a39fc61f | |||
| 2d359e5f99 | |||
|
|
04692e0c44 | ||
|
|
809fd23b88 | ||
|
|
e3653daea3 | ||
|
|
48fa75386c | ||
|
|
1b3a12a4dc | ||
|
|
543ddf540e | ||
|
|
a3fb471546 | ||
|
|
277f964b30 | ||
|
|
9bcf7adb67 | ||
|
|
f343fa0071 | ||
|
|
e5862e9218 | ||
|
|
29ae2423f8 | ||
|
|
1755a4a7df | ||
|
|
25d3325049 | ||
|
|
fb6a4c3b87 | ||
| 317f80a984 | |||
| 28cdc1d9e5 | |||
| c2087b15d5 | |||
| a8d785beb3 | |||
|
|
a6784b5549 | ||
|
|
d770292be8 | ||
|
|
b3a1037ade | ||
|
|
02946cf0b4 | ||
|
|
cf051d5108 | ||
|
|
96977c6183 | ||
|
|
73d83164fc | ||
|
|
1064f5e4a8 | ||
|
|
5be98c7087 | ||
|
|
0d689c7dff | ||
|
|
1f24ed46a0 | ||
|
|
92b4159f9e | ||
|
|
5817b41e29 | ||
| d6b132e3a6 | |||
|
|
318f70f34c | ||
|
|
e41525d40a | ||
|
|
a102220e52 | ||
|
|
e9a214c5b2 | ||
|
|
c53f5eb144 | ||
|
|
9ed64e0388 | ||
|
|
93040d4629 | ||
|
|
0144ad43f5 | ||
|
|
8da2fc30c3 | ||
| 0e27ae7795 | |||
| 33c6cdb9fe | |||
|
|
73b7014469 | ||
| 25aaf55b93 | |||
| 6a7546c43b | |||
| 0adda4bf7b | |||
|
|
f5f36427a4 | ||
|
|
590bfd3a10 | ||
|
|
16db9bd1a2 | ||
|
|
d0af933b35 | ||
|
|
2b56b40e6d | ||
|
|
4b2d7068b3 | ||
|
|
bd93b8be8e | ||
|
|
aa3fe2b872 | ||
|
|
a61ff915ac | ||
|
|
0a3e678329 | ||
|
|
d4336b0dcb | ||
|
|
65d2698af4 | ||
|
|
6454576417 | ||
|
|
a485bd5977 | ||
|
|
e733688fd0 | ||
|
|
e86f6a8cbd | ||
|
|
fcc9e17664 | ||
|
|
5c9d4ffa9a | ||
|
|
419bc2747b | ||
|
|
1ee99d6866 | ||
|
|
3ab8973895 | ||
|
|
acfa3baeb5 | ||
|
|
c21d7cf101 | ||
|
|
ec895e1d9e | ||
|
|
c964f09a4f | ||
|
|
0bc32f27df | ||
|
|
6640e93ce9 | ||
|
|
d7aefe0cf0 | ||
|
|
187fe5b361 | ||
|
|
b31aea7bc5 | ||
|
c661baf058
|
|||
|
|
0fe0461340 | ||
|
|
d5394c9e92 | ||
|
|
42135fd26c | ||
|
|
38569f55c7 | ||
|
|
5ce03c2db3 | ||
|
|
1031b3eb79 | ||
|
|
fcdf4cd476 | ||
| 6268dffff8 | |||
| c10737bfd7 | |||
|
|
bd0cc69668 | ||
|
|
84fffac264 | ||
|
|
5bf968010e | ||
|
|
61bc095d01 | ||
|
|
e376f97547 | ||
|
|
f2428d3cb3 | ||
|
|
2fdac85d31 | ||
|
|
b731395689 | ||
|
|
07405e3466 | ||
|
|
fc0c76bd77 | ||
|
|
d209547968 | ||
| 632b9fc5ea | |||
| 702591b4ec | |||
|
|
c562746e5f | ||
|
|
c0443cbec2 | ||
|
|
0191bc3821 | ||
|
|
633bd42036 | ||
|
|
998ef8d834 | ||
|
|
c25b076ca9 | ||
|
|
f43379f365 | ||
|
|
d902c0acf4 | ||
|
|
58e678d72c | ||
|
|
cbc49669d0 | ||
|
|
78bb638fd6 | ||
|
|
7a61bae471 | ||
|
|
e1b992526e | ||
|
|
1b043838ea | ||
|
|
07e72294dc | ||
|
|
b6b37ee68b | ||
|
|
43cb1f1bff | ||
|
|
f7a67c72bf | ||
|
|
c5476d08fa | ||
|
|
8af92b1557 | ||
|
|
eaa826bb8a | ||
|
|
140b3c371d | ||
|
|
f158eaa29c | ||
|
|
c4b98ade53 | ||
|
|
f2e85306ca | ||
|
|
42b9de8360 | ||
|
|
6c244f3121 | ||
|
|
9f56213d2f | ||
|
|
fb2f7cf680 | ||
|
|
8fcdd24f84 | ||
|
|
aaafde4a7c | ||
|
|
2b23003556 | ||
|
|
5681062f01 | ||
|
|
d61bf212f5 | ||
|
|
2bd7c8d51e | ||
|
|
1e63cdbcda | ||
|
|
86d85f12be | ||
|
|
dd470d49ec | ||
|
|
95d8062b00 | ||
|
|
8f82399214 | ||
|
|
6247150e9c | ||
|
5266644725
|
|||
|
81d9e96552
|
|||
|
|
4ec9f06114 | ||
|
0033e9f6c0
|
|||
|
571652c314
|
|||
|
|
7ec233e18a | ||
|
|
13c9a12336 | ||
|
|
83d472ecd6 | ||
|
|
c21da6512a | ||
|
|
4b4374e0df | ||
|
|
407276a04d | ||
|
|
64f60905b4 | ||
|
|
9e6072fed2 | ||
|
|
a3e5c424fd | ||
|
|
6683a350aa | ||
|
|
05bfa9b546 | ||
|
|
735988decb | ||
|
|
d0580592be | ||
|
|
817076bdbf | ||
|
|
736236e9ca | ||
|
|
3f4114c51b | ||
|
|
5c2c493c56 | ||
|
|
2c383ebea1 | ||
|
|
91e73450cf | ||
|
|
e55798944e | ||
|
|
5ea11a5ad2 | ||
|
|
2a3383e9e6 | ||
|
|
e871703724 | ||
|
|
1ee367d7be | ||
|
|
bce536b9b4 | ||
|
|
7c9182e0b0 | ||
|
|
aa915d639d | ||
|
|
9489ebc7d6 | ||
|
2a5c525193
|
|||
|
9e2d981c60
|
|||
|
|
53dfe9e4f5 | ||
|
48e95fbdb0
|
|||
|
fd94d85edf
|
|||
|
f2d1a85afb
|
|||
|
0bdbcb8bab
|
|||
|
|
7b91a819be | ||
| bc89025924 | |||
|
|
16bcaef4c3 | ||
|
|
fcbfa451f2 | ||
|
|
559ce53ca4 | ||
|
|
ee2c5b58d7 | ||
|
|
d98d998106 | ||
|
212c45e070
|
|||
|
143fa9b6ed
|
|||
|
4849928288
|
|||
|
|
9248ee8868 | ||
|
|
1616d96732 | ||
| 0bbedd1600 | |||
|
|
c7e49644d8 | ||
|
010c903c74
|
|||
|
e4d12e3537
|
|||
|
051cc8384e
|
|||
|
49a94170d2
|
|||
|
|
42e8e37bd4 | ||
|
|
5d2c350ce2 | ||
|
|
85dc0362c1 | ||
|
|
01c06728eb | ||
|
|
257250714d | ||
|
|
3b769c3059 | ||
|
|
a7395ed45b | ||
|
|
ab07c7928f | ||
|
|
b0c0d15505 | ||
|
|
fcf50790da | ||
|
|
1e43654607 | ||
|
|
4fecbe820d | ||
|
|
763c9dfa6b | ||
|
9de5879786
|
|||
|
|
9396e7492c | ||
|
3ac3415178
|
|||
|
1aae1c59d0
|
|||
|
907e80a01c
|
|||
|
|
8a10b69716 | ||
|
|
1a3cf7edd6 | ||
|
|
76d0fc979b | ||
|
|
a42d8ece35 | ||
|
|
93377f53fc | ||
|
|
c853d74ba0 | ||
|
|
0b9f74f4f4 | ||
|
|
5da6baf828 | ||
|
5766945006
|
|||
|
a53d473b58
|
|||
|
|
d1207ad80e | ||
|
|
e2efe71b33 | ||
|
|
2aef6ed9c0 | ||
|
|
fcb6db0603 | ||
| 01b1136316 | |||
|
|
2512fe9e75 | ||
|
|
f89b5cd2ec | ||
|
|
ab284ed208 | ||
|
|
00a578657c | ||
|
|
38ce40ae7d | ||
| e1be6c7138 | |||
| 28539e60b0 | |||
|
adb11b3ed0
|
|||
|
|
f1e6dedd44 | ||
|
|
8ea1454c06 | ||
| 81b8d578f2 | |||
|
|
16b11db39c | ||
| 0d923cc920 | |||
| c523e93564 | |||
| d588798ea1 | |||
| a11f165f2a | |||
|
|
d4f487d554 | ||
|
|
93d5a0e532 | ||
|
|
00ddc462d2 | ||
|
|
5f4a74f8ba | ||
|
|
a8eff6fbd1 | ||
|
|
baa7367ebe | ||
|
|
69f8a34aac | ||
|
|
21b3a67988 | ||
|
|
d89574ce73 | ||
| ddeac6b9d9 | |||
| 17906ec0eb | |||
|
|
311c088d3d | ||
| a2584d6083 | |||
| 35bd7739c6 | |||
| 7f43c88a39 | |||
|
|
fc1c54a141 | ||
|
|
2af111c584 | ||
| c093cca8b1 | |||
|
|
2bb1b78ba4 | ||
| 3ab26172c4 | |||
| cdd45ce88b | |||
|
210a7d3136
|
|||
|
92ec64d80f
|
|||
|
ff37f71fdb
|
|||
|
6056341525
|
|||
|
|
075612f5bd | ||
| 1a87ed8210 | |||
|
|
c05ffeb16d | ||
| ee3710c5ed | |||
| 4327c4b1f7 | |||
| 492e56a098 | |||
| f0257a2784 | |||
| ec1ead89ab | |||
|
|
ae53e87aba | ||
|
|
939dd2320a | ||
|
|
2c8b73e2e2 | ||
|
|
eabc6212ea | ||
|
|
c120d6517f | ||
|
|
597ee1dad7 | ||
|
|
c4a901504d | ||
|
|
f5cc5d07fd | ||
|
|
8a0e6c921c | ||
|
|
bf1bff9ace | ||
|
|
06f24e988f | ||
|
|
ae327f545e | ||
|
|
35012b18c5 | ||
|
|
9688bad622 | ||
|
|
447b8d3372 | ||
|
|
01102cb9b0 | ||
|
|
934d1a6114 | ||
|
|
6f74c8cb77 | ||
|
|
63b9e619a4 | ||
|
|
82e28f26d7 | ||
|
|
ca9fd96baa | ||
|
|
39b22267d6 | ||
|
|
60d7984d66 | ||
|
|
33d219d2ac | ||
|
|
85a77e05af | ||
|
|
3dfeabcec6 | ||
|
|
673fdc443c | ||
|
|
2f6e5a7648 | ||
|
|
2cbe8e9517 | ||
|
|
2f0460d6ec | ||
|
|
37f4ed7770 | ||
|
|
e3104c61cb | ||
|
|
bc434ee8cb | ||
|
|
f4102b948e | ||
|
|
ed991de11a | ||
|
|
322e161064 | ||
|
|
1adc741cc2 | ||
|
|
4eff87bbf7 | ||
|
|
fc6970d08a | ||
|
|
f616c7e1c6 | ||
|
|
89ec749172 | ||
|
|
182f0f2c64 | ||
|
|
e3681495ce | ||
|
|
37415fa261 | ||
|
|
7243dbe763 | ||
|
|
0ff5c4bedd | ||
|
|
f047f89ad5 | ||
|
|
0eb0aa1d3b | ||
|
|
6019891591 | ||
|
|
615281601c | ||
|
|
82baf5d384 | ||
|
|
6fe93ecb7e | ||
|
|
b3222f3523 | ||
|
|
3b94863521 | ||
|
|
582dc8bf46 | ||
|
|
a9868fd275 | ||
|
|
218e56576a | ||
|
|
c50e79375a | ||
|
|
dcb8308f35 | ||
|
|
183b310696 | ||
|
|
c7d0c86d52 | ||
|
|
48225662b1 | ||
|
|
f53fc088ec | ||
|
|
05517fcbcd | ||
|
|
18af51b0a4 | ||
|
|
ede3da7a87 | ||
|
|
8e3327ef6a | ||
|
|
827f6daabc | ||
|
|
2567442321 | ||
|
|
9cf5478519 | ||
|
|
e5275311c2 | ||
|
|
21e4870e4c | ||
|
|
beba7c8d2e | ||
|
|
fe35313305 | ||
|
|
d7a8bbf40b | ||
|
|
f1893c596e | ||
|
|
6367c1ab4d | ||
|
|
9579887fc4 | ||
|
|
e29be2f140 | ||
|
|
2736b5d1ef | ||
|
|
ff52fb16b6 | ||
|
|
ccbf3867e1 | ||
|
|
f0de422c6e | ||
|
|
64cc19b252 | ||
|
|
26226009f0 | ||
|
|
d10e09da02 | ||
|
|
00a2e58fee | ||
|
|
b1cb45dfe6 | ||
|
|
a2951d1f05 | ||
|
|
c0b1e97602 | ||
|
|
71621a9dc4 | ||
|
|
b3ed2afebe | ||
|
|
704620baff | ||
|
|
8feb805167 | ||
|
|
065b32755a | ||
|
|
1b5f4bff2c | ||
|
|
8e1c5a485f | ||
| 5fa6c9db35 | |||
| 5482b9be2c | |||
|
|
7400273b0a | ||
|
|
0b7cdde4a0 | ||
|
|
d5382aec4f | ||
|
|
df484dc816 | ||
|
|
7ea4086807 | ||
|
|
b04bf6a951 | ||
| 7c33dcf630 | |||
| 5e65e21f0b | |||
| 53ca38ce53 | |||
|
|
398e3c1b91 | ||
| 508978d586 | |||
| e267481f71 | |||
|
|
193bee5ac8 | ||
| f58efa2871 | |||
| 6568b6d723 | |||
|
|
4b1b34d8a7 | ||
| 39c09f8565 | |||
|
|
275a77807e | ||
|
|
6443541a79 | ||
|
|
5eb6f7d307 | ||
|
|
bce2a66177 | ||
|
|
7602641909 | ||
|
|
54f3a261c5 | ||
|
|
906bac965f | ||
|
|
4ec1de6900 | ||
|
|
8ded131666 | ||
| 47b14f932e | |||
|
|
838ebb3f69 | ||
| c459724114 | |||
| b0c9d1164d | |||
| 7c51d88501 | |||
| 5b03cf826b | |||
| f305863616 | |||
| db5809d522 | |||
|
|
83df6f015c | ||
| e7231b0e13 | |||
|
|
cff60eb51c | ||
|
f914a312f5
|
|||
| 56ebb301ca | |||
|
|
a59df12595 | ||
|
|
5cc7fc6ccb | ||
|
|
55027cb630 | ||
|
|
036eba68e1 | ||
|
|
d34e0d9348 | ||
|
|
31765ce0ef | ||
|
|
9fe7cdca92 | ||
|
|
adc3502b6b | ||
|
|
95fe369648 | ||
|
|
01845a0cb7 | ||
|
|
708eaf4178 | ||
|
|
d629a58712 | ||
|
|
90886b63d6 | ||
|
|
084f89fa32 | ||
|
|
ceb3a095d8 | ||
|
|
1758275f11 | ||
|
|
e74e506ffe | ||
|
|
599a36466a | ||
|
|
613e128cab | ||
|
|
e4f8022b7a | ||
|
|
5603c41900 | ||
| a8a27c9b51 | |||
|
|
b70de5a4be | ||
|
|
b1fd07cd30 | ||
|
|
6ab2e02fe6 | ||
|
|
5535c5780c | ||
|
|
49e0a2c055 | ||
|
|
efbe53b6b4 | ||
|
5e074dad10
|
|||
|
d6a88896d0
|
|||
|
5c99f5f8bb
|
|||
|
e1faba0ff2
|
|||
|
ba2f406bc0
|
|||
|
9b6db4684a
|
|||
|
|
561fd41d5d | ||
|
|
ce9995dac7 | ||
|
|
0afaea9513 | ||
|
|
9b5c6e3164 | ||
|
|
e6ebec8c1e | ||
|
|
2551921ed6 | ||
|
|
e02575aad7 | ||
|
|
ff3502c87a | ||
|
|
017f9b2140 | ||
|
|
c80d3a6958 | ||
|
|
3ca1127685 | ||
|
|
18369da5bc | ||
|
|
e65100cdc8 | ||
|
|
6a1cb51c2f | ||
|
c4d93e492b
|
|||
|
c2f72f72ac
|
|||
|
721b6b2afa
|
|||
|
b6f011c669
|
|||
|
801607fc16
|
|||
|
01a4d33514
|
|||
|
e348ec74fd
|
|||
|
0458675608
|
|||
|
c61ffce0e9
|
|||
|
68a97dc980
|
|||
|
a07d167390
|
|||
|
|
a8721dcc69 | ||
|
|
68cf952ac6 | ||
|
|
e14d6a81fe | ||
|
|
a4912893a8 | ||
|
0adfb631ef
|
|||
|
b64ce1f67f
|
|||
|
e8e3b1595d
|
|||
|
f1427d5272
|
|||
|
|
bf6b87d65c | ||
|
|
0240997257 | ||
|
|
f1e341f0b9 | ||
|
a54acb8c42
|
|||
|
c6ede67589
|
|||
|
|
11176da5d8 | ||
|
|
0a604336c4 | ||
|
|
be9df7649f | ||
|
|
63fb923995 | ||
|
|
3afe40083d | ||
|
|
9d4767539c | ||
|
ac9bba8b5b
|
|||
|
80c46bea7f
|
|||
|
|
614f694777 | ||
|
|
1072d7b449 | ||
|
1b70596735
|
|||
|
|
61eebc9fbd | ||
|
b05909969f
|
|||
|
bd89ce7cc9
|
|||
|
130613b717
|
|||
|
b3c1f39a0e
|
|||
|
97c807cd33
|
|||
|
aede5f71ec
|
|||
|
786770f56a
|
|||
|
|
74d4f00784 | ||
|
d61c4235dc
|
|||
|
e8794b8c79
|
|||
|
552da005dc
|
|||
|
|
51452d2e68 | ||
|
5c5484b4d2
|
|||
|
|
684cb5a376 | ||
| 649d50812b | |||
| 2502989ca2 | |||
| ba7cc9168e | |||
| dc0d9fe038 | |||
| 0e6c6937cd | |||
| d839c53642 |
84
.claude/settings.json
Normal file
84
.claude/settings.json
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
{
|
||||||
|
"hooks": {
|
||||||
|
"PostToolUse": [
|
||||||
|
{
|
||||||
|
"matcher": "Task",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code post-task"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": "TodoWrite",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code post-todo"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"PreToolUse": [
|
||||||
|
{
|
||||||
|
"matcher": "Task",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code pre-task"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"SessionEnd": [
|
||||||
|
{
|
||||||
|
"matcher": "",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code session-end"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"SessionStart": [
|
||||||
|
{
|
||||||
|
"matcher": "",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code session-start"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Stop": [
|
||||||
|
{
|
||||||
|
"matcher": "",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code stop"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"UserPromptSubmit": [
|
||||||
|
{
|
||||||
|
"matcher": "",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "entire hooks claude-code user-prompt-submit"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"permissions": {
|
||||||
|
"deny": [
|
||||||
|
"Read(./.entire/metadata/**)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
4
.entire/.gitignore
vendored
Normal file
4
.entire/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
tmp/
|
||||||
|
settings.local.json
|
||||||
|
metadata/
|
||||||
|
logs/
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
# Session Context
|
||||||
|
|
||||||
|
Session ID: 28147033-ddc8-4056-b064-e0558fbc614e
|
||||||
|
Commit Message: There are memory leaks in production on queries that trigger a full scan
|
||||||
|
|
||||||
|
## Prompts
|
||||||
|
|
||||||
|
### Prompt 1
|
||||||
|
|
||||||
|
There are memory leaks in production on queries that trigger a full scan. Review all queries for missing close and other bugs that could cause memory leaks in the sqlite-3 driver. Example for a query that triggers a massive allocation is a specific jobstate list with sorting on start_time on all jobs.
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
All 31 `.Query()` calls now have proper `rows.Close()`. Build passes, tests pass, vet clean.
|
||||||
|
|
||||||
|
### Summary of changes
|
||||||
|
|
||||||
|
**`internal/repository/stats.go`** (7 fixes + 1 rewrite):
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()` to `JobsStatsGrouped`, `JobCountGrouped`, `AddJobCountGrouped`, `jobsStatisticsHistogram`, `jobsDurationStatisticsHistogram`, `jobsMetricStatisticsHistogram`
|
||||||
|
- Rewrote `AddJobCount` to use `QueryRow()` instead of `Query()` (single-row result)
|
||||||
|
|
||||||
|
**`internal/repository/tags.go`** (5 fixes):
|
||||||
|
- Added `defer xrows.Close()` + `xrows.Err()` for `Queryx` in `CountTags`
|
||||||
|
- Added `defer rows.Close()` for second query in `CountTags`
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()` to `GetTags`, `GetTagsDirect`, `getArchiveTags`
|
||||||
|
|
||||||
|
**`internal/repository/node.go`** (4 fixes + 1 rewrite):
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()`, removed manual `rows.Close()` in `QueryNodes`, `QueryNodesWithMeta`, `CountStatesTimed`
|
||||||
|
- Rewrote `CountStates` to use `COUNT(*) ... GROUP BY` in SQL instead of fetching every row and counting in Go
|
||||||
242
.entire/metadata/28147033-ddc8-4056-b064-e0558fbc614e/full.jsonl
Normal file
242
.entire/metadata/28147033-ddc8-4056-b064-e0558fbc614e/full.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
|
|||||||
|
There are memory leaks in production on queries that trigger a full scan. Review all queries for missing close and other bugs that could cause memory leaks in the sqlite-3 driver. Example for a query that triggers a massive allocation is a specific jobstate list with sorting on start_time on all jobs.
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
All 31 `.Query()` calls now have proper `rows.Close()`. Build passes, tests pass, vet clean.
|
||||||
|
|
||||||
|
### Summary of changes
|
||||||
|
|
||||||
|
**`internal/repository/stats.go`** (7 fixes + 1 rewrite):
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()` to `JobsStatsGrouped`, `JobCountGrouped`, `AddJobCountGrouped`, `jobsStatisticsHistogram`, `jobsDurationStatisticsHistogram`, `jobsMetricStatisticsHistogram`
|
||||||
|
- Rewrote `AddJobCount` to use `QueryRow()` instead of `Query()` (single-row result)
|
||||||
|
|
||||||
|
**`internal/repository/tags.go`** (5 fixes):
|
||||||
|
- Added `defer xrows.Close()` + `xrows.Err()` for `Queryx` in `CountTags`
|
||||||
|
- Added `defer rows.Close()` for second query in `CountTags`
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()` to `GetTags`, `GetTagsDirect`, `getArchiveTags`
|
||||||
|
|
||||||
|
**`internal/repository/node.go`** (4 fixes + 1 rewrite):
|
||||||
|
- Added `defer rows.Close()` + `rows.Err()`, removed manual `rows.Close()` in `QueryNodes`, `QueryNodesWithMeta`, `CountStatesTimed`
|
||||||
|
- Rewrote `CountStates` to use `COUNT(*) ... GROUP BY` in SQL instead of fetching every row and counting in Go
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# Session Context
|
||||||
|
|
||||||
|
Session ID: 42401d2e-7d1c-4c0e-abe6-356cb2d48747
|
||||||
|
Commit Message: There also have to be bugs in jobQuery.go . Especially the following que
|
||||||
|
|
||||||
|
## Prompts
|
||||||
|
|
||||||
|
### Prompt 1
|
||||||
|
|
||||||
|
There also have to be bugs in jobQuery.go . Especially the following query triggers the memory leak: SELECT * FROM job WHERE job.job_state IN ("completed", "running", "failed") ORDER BY job.start_time DESC LIMIT 1 OFFSET 10; Dig deeper to find the cause. Also investigate why no existing index is used for this query.
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
All passes. Here's a summary of findings and fixes:
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### Bug 1: Missing `defer rows.Close()` in `user.go:GetLdapUsernames()` (line 100)
|
||||||
|
**Fixed.** Every call leaked a database connection and its buffers.
|
||||||
|
|
||||||
|
### Bug 2: SQLite never runs `ANALYZE` — wrong index selection
|
||||||
|
**Fixed.** Added `ANALYZE` to `setupSqlite()` in `dbConnection.go`.
|
||||||
|
|
||||||
|
Without statistics in `sqlite_stat1`, SQLite's query planner **guessed** and chose `jobs_jobstate_energy` (a terrible index for this query) instead of scanning `jobs_starttime` in reverse. The result:
|
||||||
|
|
||||||
|
| | Before ANALYZE | After ANALYZE |
|
||||||
|
|---|---|---|
|
||||||
|
| Index used | `jobs_jobstate_energy` | `jobs_starttime` |
|
||||||
|
| Sort | **TEMP B-TREE** (materializes ALL rows) | None (index order) |
|
||||||
|
| Memory | Proportional to total matching rows | Constant |
|
||||||
|
| I/O | Full scan of all matching rows | Stops at OFFSET+LIMIT |
|
||||||
|
|
||||||
|
### Bug 3: `IN` clause + `ORDER BY` is fundamentally incompatible with composite indexes
|
||||||
|
|
||||||
|
Even with the "correct" index `(job_state, start_time)`, SQLite **cannot** merge-sort across 3 separate index range scans for `IN ('completed','running','failed')`. It always falls back to a temp B-tree sort. The only efficient plan is to use the standalone `jobs_starttime` index — which SQLite does automatically **after ANALYZE** because it realizes the 3 states cover virtually all rows, making the WHERE clause nearly a no-op.
|
||||||
|
|
||||||
|
### Observation: 79 indexes on the `job` table
|
||||||
|
This is excessive and actively harmful — it confuses the query planner (especially without ANALYZE) and slows writes. The `jobs_jobstate_starttime` index from migration 08 is also missing from the actual DB (only the 3-column `jobs_jobstate_starttime_duration` exists). This is worth investigating separately but is a schema/migration concern, not a code bug.
|
||||||
412
.entire/metadata/42401d2e-7d1c-4c0e-abe6-356cb2d48747/full.jsonl
Normal file
412
.entire/metadata/42401d2e-7d1c-4c0e-abe6-356cb2d48747/full.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
|
|||||||
|
What optimal index selection do you suggest? The job table contains almost 20 million jobs in production, causing timeouts on any filter query that does not use an index.
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
|
||||||
|
|
||||||
|
The plan reduces 79 indexes to 20 by:
|
||||||
|
|
||||||
|
- **Eliminating entire categories**: all `shared`-prefixed (16), all user/project sort variants (11), all standalone single-filter sorts (11)
|
||||||
|
- **Consolidating**: cluster+jobstate sort variants replaced by `(cluster, sort_col)` which works for any state combo with LIMIT
|
||||||
|
- **Keeping what matters**: the indexes SQLite actually picks with ANALYZE — `jobs_starttime`, `jobs_cluster_starttime_duration`, `jobs_user_starttime_duration`, `jobs_cluster_jobstate_duration_starttime`, and a few secondary sort indexes
|
||||||
|
|
||||||
|
Key trade-off: ~20% of queries that sort by rare columns (num_hwthreads, num_acc, energy) with a state filter will now do a cheap per-row state check instead of using a 3-column composite. With LIMIT this is negligible.
|
||||||
4
.entire/settings.json
Normal file
4
.entire/settings.json
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"enabled": true,
|
||||||
|
"telemetry": true
|
||||||
|
}
|
||||||
331
.github/workflows/Release.yml
vendored
331
.github/workflows/Release.yml
vendored
@@ -1,331 +0,0 @@
|
|||||||
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
|
|
||||||
|
|
||||||
# Workflow name
|
|
||||||
name: Release
|
|
||||||
|
|
||||||
# Run on tag push
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
tags:
|
|
||||||
- '**'
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
|
|
||||||
#
|
|
||||||
# Build on AlmaLinux 8.5 using golang-1.18.2
|
|
||||||
#
|
|
||||||
AlmaLinux-RPM-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# See: https://hub.docker.com/_/almalinux
|
|
||||||
container: almalinux:8.5
|
|
||||||
# The job outputs link to the outputs of the 'rpmrename' step
|
|
||||||
# Only job outputs can be used in child jobs
|
|
||||||
outputs:
|
|
||||||
rpm : ${{steps.rpmrename.outputs.RPM}}
|
|
||||||
srpm : ${{steps.rpmrename.outputs.SRPM}}
|
|
||||||
steps:
|
|
||||||
|
|
||||||
# Use dnf to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: |
|
|
||||||
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
|
|
||||||
dnf --assumeyes install wget openssl-devel diffutils delve which npm
|
|
||||||
dnf --assumeyes install 'dnf-command(builddep)'
|
|
||||||
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
# Use dnf to install build dependencies
|
|
||||||
- name: Install build dependencies
|
|
||||||
run: |
|
|
||||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
|
||||||
rpm -i go*.rpm
|
|
||||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
|
||||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
|
||||||
|
|
||||||
- name: RPM build ClusterCockpit
|
|
||||||
id: rpmbuild
|
|
||||||
run: make RPM
|
|
||||||
|
|
||||||
# AlmaLinux 8.5 is a derivate of RedHat Enterprise Linux 8 (UBI8),
|
|
||||||
# so the created RPM both contain the substring 'el8' in the RPM file names
|
|
||||||
# This step replaces the substring 'el8' to 'alma85'. It uses the move operation
|
|
||||||
# because it is unclear whether the default AlmaLinux 8.5 container contains the
|
|
||||||
# 'rename' command. This way we also get the new names for output.
|
|
||||||
- name: Rename RPMs (s/el8/alma85/)
|
|
||||||
id: rpmrename
|
|
||||||
run: |
|
|
||||||
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
|
|
||||||
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
|
|
||||||
NEW_RPM="${OLD_RPM/el8/alma85}"
|
|
||||||
NEW_SRPM=${OLD_SRPM/el8/alma85}
|
|
||||||
mv "${OLD_RPM}" "${NEW_RPM}"
|
|
||||||
mv "${OLD_SRPM}" "${NEW_SRPM}"
|
|
||||||
echo "::set-output name=SRPM::${NEW_SRPM}"
|
|
||||||
echo "::set-output name=RPM::${NEW_RPM}"
|
|
||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
|
||||||
- name: Save RPM as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend RPM for AlmaLinux 8.5
|
|
||||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
|
||||||
- name: Save SRPM as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend SRPM for AlmaLinux 8.5
|
|
||||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
|
||||||
|
|
||||||
#
|
|
||||||
# Build on UBI 8 using golang-1.18.2
|
|
||||||
#
|
|
||||||
UBI-8-RPM-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
|
||||||
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
|
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
|
||||||
outputs:
|
|
||||||
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
|
||||||
srpm : ${{steps.rpmbuild.outputs.SRPM}}
|
|
||||||
steps:
|
|
||||||
|
|
||||||
# Use dnf to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
|
|
||||||
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
# Use dnf to install build dependencies
|
|
||||||
- name: Install build dependencies
|
|
||||||
run: |
|
|
||||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
|
||||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
|
||||||
rpm -i go*.rpm
|
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install npm
|
|
||||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
|
||||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
|
||||||
|
|
||||||
- name: RPM build ClusterCockpit
|
|
||||||
id: rpmbuild
|
|
||||||
run: make RPM
|
|
||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
|
||||||
- name: Save RPM as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend RPM for UBI 8
|
|
||||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
|
||||||
- name: Save SRPM as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend SRPM for UBI 8
|
|
||||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
|
||||||
|
|
||||||
#
|
|
||||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
|
||||||
#
|
|
||||||
Ubuntu-focal-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
container: ubuntu:20.04
|
|
||||||
# The job outputs link to the outputs of the 'debrename' step
|
|
||||||
# Only job outputs can be used in child jobs
|
|
||||||
outputs:
|
|
||||||
deb : ${{steps.debrename.outputs.DEB}}
|
|
||||||
steps:
|
|
||||||
# Use apt to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: |
|
|
||||||
apt update && apt --assume-yes upgrade
|
|
||||||
apt --assume-yes install build-essential sed git wget bash
|
|
||||||
apt --assume-yes install npm
|
|
||||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
# Use official golang package
|
|
||||||
- name: Install Golang
|
|
||||||
run: |
|
|
||||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
|
||||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
|
||||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
|
||||||
go version
|
|
||||||
- name: DEB build ClusterCockpit
|
|
||||||
id: dpkg-build
|
|
||||||
run: |
|
|
||||||
ls -la
|
|
||||||
pwd
|
|
||||||
env
|
|
||||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
|
||||||
git config --global --add safe.directory $(pwd)
|
|
||||||
make DEB
|
|
||||||
- name: Rename DEB (add '_ubuntu20.04')
|
|
||||||
id: debrename
|
|
||||||
run: |
|
|
||||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
|
||||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
|
|
||||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
|
||||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
|
||||||
- name: Save DEB as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend DEB for Ubuntu 20.04
|
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
|
||||||
|
|
||||||
#
|
|
||||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
|
||||||
#
|
|
||||||
Ubuntu-jammy-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
container: ubuntu:22.04
|
|
||||||
# The job outputs link to the outputs of the 'debrename' step
|
|
||||||
# Only job outputs can be used in child jobs
|
|
||||||
outputs:
|
|
||||||
deb : ${{steps.debrename.outputs.DEB}}
|
|
||||||
steps:
|
|
||||||
# Use apt to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: |
|
|
||||||
apt update && apt --assume-yes upgrade
|
|
||||||
apt --assume-yes install build-essential sed git wget bash npm
|
|
||||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
# Use official golang package
|
|
||||||
- name: Install Golang
|
|
||||||
run: |
|
|
||||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
|
||||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
|
||||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
|
||||||
go version
|
|
||||||
- name: DEB build ClusterCockpit
|
|
||||||
id: dpkg-build
|
|
||||||
run: |
|
|
||||||
ls -la
|
|
||||||
pwd
|
|
||||||
env
|
|
||||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
|
||||||
git config --global --add safe.directory $(pwd)
|
|
||||||
make DEB
|
|
||||||
- name: Rename DEB (add '_ubuntu22.04')
|
|
||||||
id: debrename
|
|
||||||
run: |
|
|
||||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
|
||||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
|
|
||||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
|
||||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
|
||||||
- name: Save DEB as artifact
|
|
||||||
uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend DEB for Ubuntu 22.04
|
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
|
||||||
|
|
||||||
#
|
|
||||||
# Create release with fresh RPMs
|
|
||||||
#
|
|
||||||
Release:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# We need the RPMs, so add dependency
|
|
||||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
|
|
||||||
|
|
||||||
steps:
|
|
||||||
# See: https://github.com/actions/download-artifact
|
|
||||||
- name: Download AlmaLinux 8.5 RPM
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend RPM for AlmaLinux 8.5
|
|
||||||
- name: Download AlmaLinux 8.5 SRPM
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend SRPM for AlmaLinux 8.5
|
|
||||||
|
|
||||||
- name: Download UBI 8 RPM
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend RPM for UBI 8
|
|
||||||
- name: Download UBI 8 SRPM
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend SRPM for UBI 8
|
|
||||||
|
|
||||||
- name: Download Ubuntu 20.04 DEB
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend DEB for Ubuntu 20.04
|
|
||||||
|
|
||||||
- name: Download Ubuntu 22.04 DEB
|
|
||||||
uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: cc-backend DEB for Ubuntu 22.04
|
|
||||||
|
|
||||||
# The download actions do not publish the name of the downloaded file,
|
|
||||||
# so we re-use the job outputs of the parent jobs. The files are all
|
|
||||||
# downloaded to the current folder.
|
|
||||||
# The gh-release action afterwards does not accept file lists but all
|
|
||||||
# files have to be listed at 'files'. The step creates one output per
|
|
||||||
# RPM package (2 per distro)
|
|
||||||
- name: Set RPM variables
|
|
||||||
id: files
|
|
||||||
run: |
|
|
||||||
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
|
|
||||||
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
|
|
||||||
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
|
|
||||||
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
|
|
||||||
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
|
|
||||||
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
|
|
||||||
echo "ALMA_85_RPM::${ALMA_85_RPM}"
|
|
||||||
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
|
|
||||||
echo "UBI_8_RPM::${UBI_8_RPM}"
|
|
||||||
echo "UBI_8_SRPM::${UBI_8_SRPM}"
|
|
||||||
echo "U_2004_DEB::${U_2004_DEB}"
|
|
||||||
echo "U_2204_DEB::${U_2204_DEB}"
|
|
||||||
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
|
|
||||||
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
|
|
||||||
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
|
|
||||||
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
|
|
||||||
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
|
|
||||||
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
|
|
||||||
|
|
||||||
# See: https://github.com/softprops/action-gh-release
|
|
||||||
- name: Release
|
|
||||||
uses: softprops/action-gh-release@v1
|
|
||||||
if: startsWith(github.ref, 'refs/tags/')
|
|
||||||
with:
|
|
||||||
name: cc-backend-${{github.ref_name}}
|
|
||||||
files: |
|
|
||||||
${{ steps.files.outputs.ALMA_85_RPM }}
|
|
||||||
${{ steps.files.outputs.ALMA_85_SRPM }}
|
|
||||||
${{ steps.files.outputs.UBI_8_RPM }}
|
|
||||||
${{ steps.files.outputs.UBI_8_SRPM }}
|
|
||||||
${{ steps.files.outputs.U_2004_DEB }}
|
|
||||||
${{ steps.files.outputs.U_2204_DEB }}
|
|
||||||
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
|||||||
- name: Install Go
|
- name: Install Go
|
||||||
uses: actions/setup-go@v4
|
uses: actions/setup-go@v4
|
||||||
with:
|
with:
|
||||||
go-version: 1.20.x
|
go-version: 1.25.x
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
- name: Build, Vet & Test
|
- name: Build, Vet & Test
|
||||||
|
|||||||
29
.gitignore
vendored
29
.gitignore
vendored
@@ -1,19 +1,32 @@
|
|||||||
/cc-backend
|
/cc-backend
|
||||||
|
|
||||||
/var/job-archive
|
|
||||||
/var/*.db
|
|
||||||
/var/machine-state
|
|
||||||
|
|
||||||
/.env
|
/.env
|
||||||
/config.json
|
/config.json
|
||||||
|
/uiConfig.json
|
||||||
|
|
||||||
|
/var/job-archive
|
||||||
|
/var/machine-state
|
||||||
|
/var/*.db-shm
|
||||||
|
/var/*.db-wal
|
||||||
|
/var/*.db
|
||||||
|
/var/*.txt
|
||||||
|
|
||||||
|
/var/checkpoints*
|
||||||
|
|
||||||
|
migrateTimestamps.pl
|
||||||
|
test_ccms_*
|
||||||
|
|
||||||
/web/frontend/public/build
|
/web/frontend/public/build
|
||||||
/web/frontend/node_modules
|
/web/frontend/node_modules
|
||||||
/.vscode/*
|
|
||||||
/archive-migration
|
/archive-migration
|
||||||
/archive-manager
|
/archive-manager
|
||||||
var/job.db-shm
|
|
||||||
var/job.db-wal
|
|
||||||
|
|
||||||
|
/internal/repository/testdata/job.db-shm
|
||||||
|
/internal/repository/testdata/job.db-wal
|
||||||
|
|
||||||
|
/.vscode/*
|
||||||
dist/
|
dist/
|
||||||
*.db
|
*.db
|
||||||
|
.idea
|
||||||
|
tools/archive-migration/archive-migration
|
||||||
|
tools/archive-manager/archive-manager
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
version: 2
|
||||||
before:
|
before:
|
||||||
hooks:
|
hooks:
|
||||||
- go mod tidy
|
- go mod tidy
|
||||||
@@ -61,7 +62,7 @@ builds:
|
|||||||
tags:
|
tags:
|
||||||
- static_build
|
- static_build
|
||||||
archives:
|
archives:
|
||||||
- format: tar.gz
|
- formats: tar.gz
|
||||||
# this name template makes the OS and Arch compatible with the results of uname.
|
# this name template makes the OS and Arch compatible with the results of uname.
|
||||||
name_template: >-
|
name_template: >-
|
||||||
{{ .ProjectName }}_
|
{{ .ProjectName }}_
|
||||||
@@ -70,9 +71,9 @@ archives:
|
|||||||
{{- else }}{{ .Arch }}{{ end }}
|
{{- else }}{{ .Arch }}{{ end }}
|
||||||
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
||||||
checksum:
|
checksum:
|
||||||
name_template: 'checksums.txt'
|
name_template: "checksums.txt"
|
||||||
snapshot:
|
snapshot:
|
||||||
name_template: "{{ incpatch .Version }}-next"
|
version_template: "{{ incpatch .Version }}-next"
|
||||||
changelog:
|
changelog:
|
||||||
sort: asc
|
sort: asc
|
||||||
filters:
|
filters:
|
||||||
@@ -100,7 +101,7 @@ changelog:
|
|||||||
release:
|
release:
|
||||||
draft: false
|
draft: false
|
||||||
footer: |
|
footer: |
|
||||||
Supports job archive version 1 and database version 6.
|
Supports job archive version 3 and database version 10.
|
||||||
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
||||||
|
|
||||||
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
||||||
|
|||||||
26
AGENTS.md
Normal file
26
AGENTS.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# ClusterCockpit Backend - Agent Guidelines
|
||||||
|
|
||||||
|
## Build/Test Commands
|
||||||
|
|
||||||
|
- Build: `make` or `go build ./cmd/cc-backend`
|
||||||
|
- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`)
|
||||||
|
- Run single test: `go test -run TestName ./path/to/package`
|
||||||
|
- Run single test file: `go test ./path/to/package -run TestName`
|
||||||
|
- Frontend build: `cd web/frontend && npm install && npm run build`
|
||||||
|
- Generate GraphQL: `make graphql` (uses gqlgen)
|
||||||
|
- Generate Swagger: `make swagger` (uses swaggo/swag)
|
||||||
|
|
||||||
|
## Code Style
|
||||||
|
|
||||||
|
- **Formatting**: Use `gofumpt` for all Go files (strict requirement)
|
||||||
|
- **Copyright header**: All files must include copyright header (see existing files)
|
||||||
|
- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration
|
||||||
|
- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines)
|
||||||
|
- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`)
|
||||||
|
- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package
|
||||||
|
- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`)
|
||||||
|
- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName`
|
||||||
|
- **Comments**: Document all exported functions/types with godoc-style comments
|
||||||
|
- **Structs**: Document fields with inline comments, especially for complex configurations
|
||||||
|
- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses
|
||||||
|
- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding
|
||||||
306
CLAUDE.md
Normal file
306
CLAUDE.md
Normal file
@@ -0,0 +1,306 @@
|
|||||||
|
# CLAUDE.md
|
||||||
|
|
||||||
|
This file provides guidance to Claude Code (claude.ai/code) when working with
|
||||||
|
code in this repository.
|
||||||
|
|
||||||
|
## Project Overview
|
||||||
|
|
||||||
|
ClusterCockpit is a job-specific performance monitoring framework for HPC
|
||||||
|
clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a
|
||||||
|
Svelte-based frontend, and manages job archives and metric data from various
|
||||||
|
time-series databases.
|
||||||
|
|
||||||
|
## Build and Development Commands
|
||||||
|
|
||||||
|
### Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build everything (frontend + backend)
|
||||||
|
make
|
||||||
|
|
||||||
|
# Build only the frontend
|
||||||
|
make frontend
|
||||||
|
|
||||||
|
# Build only the backend (requires frontend to be built first)
|
||||||
|
go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run all tests
|
||||||
|
make test
|
||||||
|
|
||||||
|
# Run tests with verbose output
|
||||||
|
go test -v ./...
|
||||||
|
|
||||||
|
# Run tests for a specific package
|
||||||
|
go test ./internal/repository
|
||||||
|
```
|
||||||
|
|
||||||
|
### Code Generation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls)
|
||||||
|
make graphql
|
||||||
|
|
||||||
|
# Regenerate Swagger/OpenAPI docs (after modifying API comments)
|
||||||
|
make swagger
|
||||||
|
```
|
||||||
|
|
||||||
|
### Frontend Development
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd web/frontend
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
npm install
|
||||||
|
|
||||||
|
# Build for production
|
||||||
|
npm run build
|
||||||
|
|
||||||
|
# Development mode with watch
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Initialize database and create admin user
|
||||||
|
./cc-backend -init-db -add-user demo:admin:demo
|
||||||
|
|
||||||
|
# Start server in development mode (enables GraphQL Playground and Swagger UI)
|
||||||
|
./cc-backend -server -dev -loglevel info
|
||||||
|
|
||||||
|
# Start demo with sample data
|
||||||
|
./startDemo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Backend Structure
|
||||||
|
|
||||||
|
The backend follows a layered architecture with clear separation of concerns:
|
||||||
|
|
||||||
|
- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems
|
||||||
|
- **internal/repository**: Data access layer using repository pattern
|
||||||
|
- Abstracts database operations (SQLite3 only)
|
||||||
|
- Implements LRU caching for performance
|
||||||
|
- Provides repositories for Job, User, Node, and Tag entities
|
||||||
|
- Transaction support for batch operations
|
||||||
|
- **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
|
||||||
|
- **internal/graph**: GraphQL API (uses gqlgen)
|
||||||
|
- Schema in `api/schema.graphqls`
|
||||||
|
- Generated code in `internal/graph/generated/`
|
||||||
|
- Resolvers in `internal/graph/schema.resolvers.go`
|
||||||
|
- **internal/auth**: Authentication layer
|
||||||
|
- Supports local accounts, LDAP, OIDC, and JWT tokens
|
||||||
|
- Implements rate limiting for login attempts
|
||||||
|
- **pkg/metricstore**: Metric store with data loading API
|
||||||
|
- In-memory metric storage with checkpointing
|
||||||
|
- Query API for loading job metric data
|
||||||
|
- **internal/archiver**: Job archiving to file-based archive
|
||||||
|
- **internal/api/nats.go**: NATS-based API for job and node operations
|
||||||
|
- Subscribes to NATS subjects for job events (start/stop)
|
||||||
|
- Handles node state updates via NATS
|
||||||
|
- Uses InfluxDB line protocol message format
|
||||||
|
- **pkg/archive**: Job archive backend implementations
|
||||||
|
- File system backend (default)
|
||||||
|
- S3 backend
|
||||||
|
- SQLite backend (experimental)
|
||||||
|
- **parquet** sub-package: Parquet format support (schema, reader, writer, conversion)
|
||||||
|
- **internal/metricstoreclient**: Client for cc-metric-store queries
|
||||||
|
|
||||||
|
### Frontend Structure
|
||||||
|
|
||||||
|
- **web/frontend**: Svelte 5 application
|
||||||
|
- Uses Rollup for building
|
||||||
|
- Components organized by feature (analysis, job, user, etc.)
|
||||||
|
- GraphQL client using @urql/svelte
|
||||||
|
- Bootstrap 5 + SvelteStrap for UI
|
||||||
|
- uPlot for time-series visualization
|
||||||
|
- **web/templates**: Server-side Go templates
|
||||||
|
|
||||||
|
### Key Concepts
|
||||||
|
|
||||||
|
**Job Archive**: Completed jobs are stored in a file-based archive following the
|
||||||
|
[ClusterCockpit job-archive
|
||||||
|
specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
|
||||||
|
Each job has a `meta.json` file with metadata and metric data files.
|
||||||
|
|
||||||
|
**Metric Data Repositories**: Time-series metric data is stored separately from
|
||||||
|
job metadata. The system supports multiple backends (cc-metric-store is
|
||||||
|
recommended). Configuration is per-cluster in `config.json`.
|
||||||
|
|
||||||
|
**Authentication Flow**:
|
||||||
|
|
||||||
|
1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT)
|
||||||
|
2. Each authenticator's `CanLogin` method is called to determine if it should handle the request
|
||||||
|
3. The first authenticator that returns true performs the actual `Login`
|
||||||
|
4. JWT tokens are used for API authentication
|
||||||
|
|
||||||
|
**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are
|
||||||
|
applied automatically on startup. Version tracking in `version` table.
|
||||||
|
|
||||||
|
**Scopes**: Metrics can be collected at different scopes:
|
||||||
|
|
||||||
|
- Node scope (always available)
|
||||||
|
- Core scope (for jobs with ≤8 nodes)
|
||||||
|
- Accelerator scope (for GPU/accelerator metrics)
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
- **config.json**: Main configuration (clusters, metric repositories, archive settings)
|
||||||
|
- `main.apiSubjects`: NATS subject configuration (optional)
|
||||||
|
- `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event")
|
||||||
|
- `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state")
|
||||||
|
- `nats`: NATS client connection configuration (optional)
|
||||||
|
- `address`: NATS server address (e.g., "nats://localhost:4222")
|
||||||
|
- `username`: Authentication username (optional)
|
||||||
|
- `password`: Authentication password (optional)
|
||||||
|
- `creds-file-path`: Path to NATS credentials file (optional)
|
||||||
|
- **.env**: Environment variables (secrets like JWT keys)
|
||||||
|
- Copy from `configs/env-template.txt`
|
||||||
|
- NEVER commit this file
|
||||||
|
- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config)
|
||||||
|
|
||||||
|
## Database
|
||||||
|
|
||||||
|
- Default: SQLite 3 (`./var/job.db`)
|
||||||
|
- Connection managed by `internal/repository`
|
||||||
|
- Schema version in `internal/repository/migration.go`
|
||||||
|
|
||||||
|
## Code Generation
|
||||||
|
|
||||||
|
**GraphQL** (gqlgen):
|
||||||
|
|
||||||
|
- Schema: `api/schema.graphqls`
|
||||||
|
- Config: `gqlgen.yml`
|
||||||
|
- Generated code: `internal/graph/generated/`
|
||||||
|
- Custom resolvers: `internal/graph/schema.resolvers.go`
|
||||||
|
- Run `make graphql` after schema changes
|
||||||
|
|
||||||
|
**Swagger/OpenAPI**:
|
||||||
|
|
||||||
|
- Annotations in `internal/api/*.go`
|
||||||
|
- Generated docs: `internal/api/docs.go`, `api/swagger.yaml`
|
||||||
|
- Run `make swagger` after API changes
|
||||||
|
|
||||||
|
## Testing Conventions
|
||||||
|
|
||||||
|
- Test files use `_test.go` suffix
|
||||||
|
- Test data in `testdata/` subdirectories
|
||||||
|
- Repository tests use in-memory SQLite
|
||||||
|
- API tests use httptest
|
||||||
|
|
||||||
|
## Common Workflows
|
||||||
|
|
||||||
|
### Adding a new GraphQL field
|
||||||
|
|
||||||
|
1. Edit schema in `api/schema.graphqls`
|
||||||
|
2. Run `make graphql`
|
||||||
|
3. Implement resolver in `internal/graph/schema.resolvers.go`
|
||||||
|
|
||||||
|
### Adding a new REST endpoint
|
||||||
|
|
||||||
|
1. Add handler in `internal/api/*.go`
|
||||||
|
2. Add route in `internal/api/rest.go`
|
||||||
|
3. Add Swagger annotations
|
||||||
|
4. Run `make swagger`
|
||||||
|
|
||||||
|
### Adding a new metric data backend
|
||||||
|
|
||||||
|
1. Implement metric loading functions in `pkg/metricstore/query.go`
|
||||||
|
2. Add cluster configuration to metric store initialization
|
||||||
|
3. Update config.json schema documentation
|
||||||
|
|
||||||
|
### Modifying database schema
|
||||||
|
|
||||||
|
1. Create new migration in `internal/repository/migrations/sqlite3/`
|
||||||
|
2. Increment `repository.Version`
|
||||||
|
3. Test with fresh database and existing database
|
||||||
|
|
||||||
|
## NATS API
|
||||||
|
|
||||||
|
The backend supports a NATS-based API as an alternative to the REST API for job and node operations.
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
1. Configure NATS client connection in `config.json`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"nats": {
|
||||||
|
"address": "nats://localhost:4222",
|
||||||
|
"username": "user",
|
||||||
|
"password": "pass"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Configure API subjects in `config.json` under `main`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"main": {
|
||||||
|
"apiSubjects": {
|
||||||
|
"subjectJobEvent": "cc.job.event",
|
||||||
|
"subjectNodeState": "cc.node.state"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Message Format
|
||||||
|
|
||||||
|
Messages use **InfluxDB line protocol** format with the following structure:
|
||||||
|
|
||||||
|
#### Job Events
|
||||||
|
|
||||||
|
**Start Job:**
|
||||||
|
```
|
||||||
|
job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Stop Job:**
|
||||||
|
```
|
||||||
|
job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tags:**
|
||||||
|
- `function`: Either `start_job` or `stop_job`
|
||||||
|
|
||||||
|
**Fields:**
|
||||||
|
- `event`: JSON payload containing job data (see REST API documentation for schema)
|
||||||
|
|
||||||
|
#### Node State Updates
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"hostname": "node001",
|
||||||
|
"states": ["allocated"],
|
||||||
|
"cpusAllocated": 8,
|
||||||
|
"memoryAllocated": 16384,
|
||||||
|
"gpusAllocated": 0,
|
||||||
|
"jobsRunning": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Implementation Notes
|
||||||
|
|
||||||
|
- NATS API mirrors REST API functionality but uses messaging
|
||||||
|
- Job start/stop events are processed asynchronously
|
||||||
|
- Duplicate job detection is handled (same as REST API)
|
||||||
|
- All validation rules from REST API apply
|
||||||
|
- Messages are logged; no responses are sent back to publishers
|
||||||
|
- If NATS client is unavailable, API subscriptions are skipped (logged as warning)
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- Go 1.24.0+ (check go.mod for exact version)
|
||||||
|
- Node.js (for frontend builds)
|
||||||
|
- SQLite 3 (only supported database)
|
||||||
|
- Optional: NATS server for NATS API integration
|
||||||
53
Makefile
53
Makefile
@@ -1,8 +1,6 @@
|
|||||||
TARGET = ./cc-backend
|
TARGET = ./cc-backend
|
||||||
VAR = ./var
|
|
||||||
CFG = config.json .env
|
|
||||||
FRONTEND = ./web/frontend
|
FRONTEND = ./web/frontend
|
||||||
VERSION = 1.3.1
|
VERSION = 1.5.0
|
||||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||||
@@ -22,23 +20,42 @@ SVELTE_COMPONENTS = status \
|
|||||||
header
|
header
|
||||||
|
|
||||||
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
|
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
|
||||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||||
$(wildcard $(FRONTEND)/src/*.js) \
|
$(wildcard $(FRONTEND)/src/*.js) \
|
||||||
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
|
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
|
||||||
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
|
$(wildcard $(FRONTEND)/src/config/*.svelte) \
|
||||||
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
|
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/*.js) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/header/*.svelte) \
|
||||||
|
$(wildcard $(FRONTEND)/src/job/*.svelte)
|
||||||
|
|
||||||
.PHONY: clean distclean test tags frontend $(TARGET)
|
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
|
||||||
|
|
||||||
.NOTPARALLEL:
|
.NOTPARALLEL:
|
||||||
|
|
||||||
$(TARGET): $(VAR) $(CFG) $(SVELTE_TARGETS)
|
$(TARGET): $(SVELTE_TARGETS)
|
||||||
$(info ===> BUILD cc-backend)
|
$(info ===> BUILD cc-backend)
|
||||||
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
|
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
|
||||||
|
|
||||||
frontend:
|
frontend:
|
||||||
$(info ===> BUILD frontend)
|
$(info ===> BUILD frontend)
|
||||||
cd web/frontend && npm install && npm run build
|
cd web/frontend && npm ci && npm run build
|
||||||
|
|
||||||
|
swagger:
|
||||||
|
$(info ===> GENERATE swagger)
|
||||||
|
@go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
|
||||||
|
@mv ./api/docs.go ./internal/api/docs.go
|
||||||
|
|
||||||
|
graphql:
|
||||||
|
$(info ===> GENERATE graphql)
|
||||||
|
@go tool github.com/99designs/gqlgen
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(info ===> CLEAN)
|
$(info ===> CLEAN)
|
||||||
@@ -49,7 +66,7 @@ distclean:
|
|||||||
@$(MAKE) clean
|
@$(MAKE) clean
|
||||||
$(info ===> DISTCLEAN)
|
$(info ===> DISTCLEAN)
|
||||||
@rm -rf $(FRONTEND)/node_modules
|
@rm -rf $(FRONTEND)/node_modules
|
||||||
@rm -rf $(VAR)
|
@rm -rf ./var
|
||||||
|
|
||||||
test:
|
test:
|
||||||
$(info ===> TESTING)
|
$(info ===> TESTING)
|
||||||
@@ -63,16 +80,8 @@ tags:
|
|||||||
@ctags -R
|
@ctags -R
|
||||||
|
|
||||||
$(VAR):
|
$(VAR):
|
||||||
@mkdir $(VAR)
|
@mkdir -p $(VAR)
|
||||||
|
|
||||||
config.json:
|
|
||||||
$(info ===> Initialize config.json file)
|
|
||||||
@cp configs/config.json config.json
|
|
||||||
|
|
||||||
.env:
|
|
||||||
$(info ===> Initialize .env file)
|
|
||||||
@cp configs/env-template.txt .env
|
|
||||||
|
|
||||||
$(SVELTE_TARGETS): $(SVELTE_SRC)
|
$(SVELTE_TARGETS): $(SVELTE_SRC)
|
||||||
$(info ===> BUILD frontend)
|
$(info ===> BUILD frontend)
|
||||||
cd web/frontend && npm install && npm run build
|
cd web/frontend && npm ci && npm run build
|
||||||
|
|||||||
237
README.md
237
README.md
@@ -1,5 +1,8 @@
|
|||||||
# NOTE
|
# NOTE
|
||||||
|
|
||||||
|
While we do our best to keep the master branch in a usable state, there is no guarantee the master branch works.
|
||||||
|
Please do not use it for production!
|
||||||
|
|
||||||
Please have a look at the [Release
|
Please have a look at the [Release
|
||||||
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
|
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
|
||||||
for breaking changes!
|
for breaking changes!
|
||||||
@@ -19,19 +22,23 @@ switching from PHP Symfony to a Golang based solution are explained
|
|||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
This is a Golang web backend for the ClusterCockpit job-specific performance
|
This is a Golang web backend for the ClusterCockpit job-specific performance
|
||||||
monitoring framework. It provides a REST API for integrating ClusterCockpit with
|
monitoring framework. It provides a REST API and an optional NATS-based messaging
|
||||||
an HPC cluster batch system and external analysis scripts. Data exchange between
|
API for integrating ClusterCockpit with an HPC cluster batch system and external
|
||||||
the web front-end and the back-end is based on a GraphQL API. The web frontend
|
analysis scripts. Data exchange between the web front-end and the back-end is
|
||||||
is also served by the backend using [Svelte](https://svelte.dev/) components.
|
based on a GraphQL API. The web frontend is also served by the backend using
|
||||||
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
|
[Svelte](https://svelte.dev/) components. Layout and styling are based on
|
||||||
|
[Bootstrap 5](https://getbootstrap.com/) using
|
||||||
[Bootstrap Icons](https://icons.getbootstrap.com/).
|
[Bootstrap Icons](https://icons.getbootstrap.com/).
|
||||||
|
|
||||||
The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by
|
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
|
||||||
default. Optionally it can use a MySQL/MariaDB database server. While there are
|
While there are metric data backends for the InfluxDB and Prometheus time series
|
||||||
metric data backends for the InfluxDB and Prometheus time series databases, the
|
databases, the only tested and supported setup is to use cc-metric-store as the
|
||||||
only tested and supported setup is to use cc-metric-store as the metric data
|
metric data backend. Documentation on how to integrate ClusterCockpit with other
|
||||||
backend. Documentation on how to integrate ClusterCockpit with other time series
|
time series databases will be added in the future.
|
||||||
databases will be added in the future.
|
|
||||||
|
For real-time integration with HPC systems, the backend can subscribe to
|
||||||
|
[NATS](https://nats.io/) subjects to receive job start/stop events and node
|
||||||
|
state updates, providing an alternative to REST API polling.
|
||||||
|
|
||||||
Completed batch jobs are stored in a file-based job archive according to
|
Completed batch jobs are stored in a file-based job archive according to
|
||||||
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
|
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
|
||||||
@@ -65,11 +72,11 @@ cd ./cc-backend
|
|||||||
./startDemo.sh
|
./startDemo.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also try the demo using the latest release binary.
|
You can also try the demo using the latest release binary.
|
||||||
Create a folder and put the release binary `cc-backend` into this folder.
|
Create a folder and put the release binary `cc-backend` into this folder.
|
||||||
Execute the following steps:
|
Execute the following steps:
|
||||||
|
|
||||||
``` shell
|
```shell
|
||||||
./cc-backend -init
|
./cc-backend -init
|
||||||
vim config.json (Add a second cluster entry and name the clusters alex and fritz)
|
vim config.json (Add a second cluster entry and name the clusters alex and fritz)
|
||||||
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
|
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
|
||||||
@@ -88,9 +95,11 @@ Analysis, Systems and Status views).
|
|||||||
There is a Makefile to automate the build of cc-backend. The Makefile supports
|
There is a Makefile to automate the build of cc-backend. The Makefile supports
|
||||||
the following targets:
|
the following targets:
|
||||||
|
|
||||||
* `make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequisite handling. Any change of frontend source files will result in a complete rebuild.
|
- `make`: Initialize `var` directory and build svelte frontend and backend
|
||||||
* `make clean`: Clean go build cache and remove binary.
|
binary. Note that there is no proper prerequisite handling. Any change of
|
||||||
* `make test`: Run the tests that are also run in the GitHub workflow setup.
|
frontend source files will result in a complete rebuild.
|
||||||
|
- `make clean`: Clean go build cache and remove binary.
|
||||||
|
- `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||||
|
|
||||||
A common workflow for setting up cc-backend from scratch is:
|
A common workflow for setting up cc-backend from scratch is:
|
||||||
|
|
||||||
@@ -124,45 +133,161 @@ ln -s <your-existing-job-archive> ./var/job-archive
|
|||||||
./cc-backend -help
|
./cc-backend -help
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Database Configuration
|
||||||
|
|
||||||
|
cc-backend uses SQLite as its database. For large installations, SQLite memory
|
||||||
|
usage can be tuned via the optional `db-config` section in config.json under
|
||||||
|
`main`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"main": {
|
||||||
|
"db": "./var/job.db",
|
||||||
|
"db-config": {
|
||||||
|
"cache-size-mb": 2048,
|
||||||
|
"soft-heap-limit-mb": 16384,
|
||||||
|
"max-open-connections": 4,
|
||||||
|
"max-idle-connections": 4,
|
||||||
|
"max-idle-time-minutes": 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
All fields are optional. If `db-config` is omitted entirely, built-in defaults
|
||||||
|
are used.
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
| Option | Default | Description |
|
||||||
|
| ----------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `cache-size-mb` | 2048 | SQLite page cache size per connection in MB. Maps to `PRAGMA cache_size`. Total cache memory is up to `cache-size-mb × max-open-connections`. |
|
||||||
|
| `soft-heap-limit-mb` | 16384 | Process-wide SQLite soft heap limit in MB. SQLite will try to release cache pages to stay under this limit. Queries won't fail if exceeded, but cache eviction becomes more aggressive. |
|
||||||
|
| `max-open-connections` | 4 | Maximum number of open database connections. |
|
||||||
|
| `max-idle-connections` | 4 | Maximum number of idle database connections kept in the pool. |
|
||||||
|
| `max-idle-time-minutes` | 10 | Maximum time in minutes a connection can sit idle before being closed. |
|
||||||
|
|
||||||
|
### Sizing Guidelines
|
||||||
|
|
||||||
|
SQLite's `cache_size` is a **per-connection** setting — each connection
|
||||||
|
maintains its own independent page cache. With multiple connections, the total
|
||||||
|
memory available for caching is the sum across all connections.
|
||||||
|
|
||||||
|
In practice, different connections tend to cache **different pages** (e.g., one
|
||||||
|
handles a job listing query while another runs a statistics aggregation), so
|
||||||
|
their caches naturally spread across the database. The formula
|
||||||
|
`DB_size / max-open-connections` gives enough per-connection cache that the
|
||||||
|
combined caches can cover the entire database.
|
||||||
|
|
||||||
|
However, this is a best-case estimate. Connections running similar queries will
|
||||||
|
cache the same pages redundantly. In the worst case (all connections caching
|
||||||
|
identical pages), only `cache-size-mb` worth of unique data is cached rather
|
||||||
|
than `cache-size-mb × max-open-connections`. For workloads with diverse
|
||||||
|
concurrent queries, cache overlap is typically low.
|
||||||
|
|
||||||
|
**Rules of thumb:**
|
||||||
|
|
||||||
|
- **cache-size-mb**: Set to `DB_size_in_MB / max-open-connections` to allow the
|
||||||
|
entire database to be cached in memory. For example, an 80GB database with 8
|
||||||
|
connections needs at least 10240 MB (10GB) per connection. If your workload
|
||||||
|
has many similar concurrent queries, consider setting it higher to account for
|
||||||
|
cache overlap between connections.
|
||||||
|
|
||||||
|
- **soft-heap-limit-mb**: Should be >= `cache-size-mb × max-open-connections` to
|
||||||
|
avoid cache thrashing. This is the total SQLite memory budget for the process.
|
||||||
|
- On small installations the defaults work well. On servers with large databases
|
||||||
|
(tens of GB) and plenty of RAM, increasing these values significantly improves
|
||||||
|
query performance by reducing disk I/O.
|
||||||
|
|
||||||
|
### Example: Large Server (512GB RAM, 80GB database)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"main": {
|
||||||
|
"db-config": {
|
||||||
|
"cache-size-mb": 16384,
|
||||||
|
"soft-heap-limit-mb": 131072,
|
||||||
|
"max-open-connections": 8,
|
||||||
|
"max-idle-time-minutes": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This allows the entire 80GB database to be cached (8 × 16GB = 128GB page cache)
|
||||||
|
with a 128GB soft heap limit, using about 25% of available RAM.
|
||||||
|
|
||||||
|
The effective configuration is logged at startup for verification.
|
||||||
|
|
||||||
## Project file structure
|
## Project file structure
|
||||||
|
|
||||||
* [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
|
- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github)
|
||||||
contains the API schema files for the REST and GraphQL APIs. The REST API is
|
GitHub Actions workflows and dependabot configuration for CI/CD.
|
||||||
documented in the OpenAPI 3.0 format in
|
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
|
||||||
[./api/openapi.yaml](./api/openapi.yaml).
|
contains the API schema files for the REST and GraphQL APIs. The REST API is
|
||||||
* [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
|
documented in the OpenAPI 3.0 format in
|
||||||
contains `main.go` for the main application.
|
[./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in
|
||||||
* [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
|
[./api/schema.graphqls](./api/schema.graphqls).
|
||||||
contains documentation about configuration and command line options and required
|
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
|
||||||
environment variables. A sample configuration file is provided.
|
contains the main application entry point and CLI implementation.
|
||||||
* [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
|
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
|
||||||
contains more in-depth documentation.
|
contains documentation about configuration and command line options and required
|
||||||
* [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
|
environment variables. Sample configuration files are provided.
|
||||||
contains an example of setting up systemd for production use.
|
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
|
||||||
* [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
|
contains an example of setting up systemd for production use.
|
||||||
contains library source code that is not intended for use by others.
|
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
|
||||||
* [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
|
contains library source code that is not intended for use by others.
|
||||||
contains Go packages that can be used by other projects.
|
- [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api)
|
||||||
* [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
|
REST API handlers and NATS integration
|
||||||
Additional command line helper tools.
|
- [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver)
|
||||||
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
Job archiving functionality
|
||||||
Commands for getting information about an existing job archive.
|
- [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth)
|
||||||
* [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
|
Authentication (local, LDAP, OIDC) and JWT token handling
|
||||||
Tool to migrate from previous to current job archive version.
|
- [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config)
|
||||||
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
Configuration management and validation
|
||||||
Tool to convert external pubkey for use in `cc-backend`.
|
- [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph)
|
||||||
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
GraphQL schema and resolvers
|
||||||
contains a small application to generate a compatible JWT keypair. You find
|
- [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer)
|
||||||
documentation on how to use it
|
Job data import and database initialization
|
||||||
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
|
- [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch)
|
||||||
* [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
|
Dispatches metric data loading to appropriate backends
|
||||||
Server-side templates and frontend-related files:
|
- [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository)
|
||||||
* [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
|
Database repository layer for jobs and metadata
|
||||||
Svelte components and static assets for the frontend UI
|
- [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig)
|
||||||
* [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
|
HTTP router configuration and middleware
|
||||||
Server-side Go templates
|
- [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger)
|
||||||
* [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
|
Job classification and application detection
|
||||||
Configures the behaviour and generation of
|
- [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
|
||||||
[gqlgen](https://github.com/99designs/gqlgen).
|
Background task management and scheduled jobs
|
||||||
* [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
|
- [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient)
|
||||||
is a shell script that sets up demo data, and builds and starts `cc-backend`.
|
Client for cc-metric-store queries
|
||||||
|
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
|
||||||
|
contains Go packages that can be used by other projects.
|
||||||
|
- [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
|
||||||
|
Job archive backend implementations (filesystem, S3, SQLite)
|
||||||
|
- [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore)
|
||||||
|
In-memory metric data store with checkpointing and metric loading
|
||||||
|
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
|
||||||
|
Additional command line helper tools.
|
||||||
|
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
||||||
|
Commands for getting infos about an existing job archive, importing jobs
|
||||||
|
between archive backends, and converting archives between JSON and Parquet formats.
|
||||||
|
- [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
|
||||||
|
Tool for migrating job archives between formats.
|
||||||
|
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
||||||
|
Tool to convert external pubkey for use in `cc-backend`.
|
||||||
|
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
||||||
|
contains a small application to generate a compatible JWT keypair. You find
|
||||||
|
documentation on how to use it
|
||||||
|
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
|
||||||
|
- [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
|
||||||
|
Server-side templates and frontend-related files:
|
||||||
|
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
|
||||||
|
Svelte components and static assets for the frontend UI
|
||||||
|
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
|
||||||
|
Server-side Go templates, including monitoring views
|
||||||
|
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
|
||||||
|
Configures the behaviour and generation of
|
||||||
|
[gqlgen](https://github.com/99designs/gqlgen).
|
||||||
|
- [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
|
||||||
|
is a shell script that sets up demo data, and builds and starts `cc-backend`.
|
||||||
|
|||||||
276
ReleaseNotes.md
276
ReleaseNotes.md
@@ -1,11 +1,279 @@
|
|||||||
# `cc-backend` version 1.3.1
|
# `cc-backend` version 1.5.0
|
||||||
|
|
||||||
Supports job archive version 1 and database version 7.
|
Supports job archive version 3 and database version 10.
|
||||||
|
|
||||||
This is a bugfix release of `cc-backend`, the API backend and frontend
|
This is a feature release of `cc-backend`, the API backend and frontend
|
||||||
implementation of ClusterCockpit.
|
implementation of ClusterCockpit.
|
||||||
For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).
|
For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).
|
||||||
|
|
||||||
## Breaking changes
|
## Breaking changes
|
||||||
|
|
||||||
None
|
### Configuration changes
|
||||||
|
|
||||||
|
- **JSON attribute naming**: All JSON configuration attributes now use `kebab-case`
|
||||||
|
style consistently (e.g., `api-allowed-ips` instead of `apiAllowedIPs`).
|
||||||
|
Update your `config.json` accordingly.
|
||||||
|
- **Removed `disable-archive` option**: This obsolete configuration option has been removed.
|
||||||
|
- **Removed `clusters` config section**: The separate clusters configuration section
|
||||||
|
has been removed. Cluster information is now derived from the job archive.
|
||||||
|
- **`apiAllowedIPs` is now optional**: If not specified, defaults to not
|
||||||
|
restricted.
|
||||||
|
|
||||||
|
### Architecture changes
|
||||||
|
|
||||||
|
- **Web framework replaced**: Migrated from `gorilla/mux` to `chi` as the HTTP
|
||||||
|
router. This should be transparent to users but affects how middleware and
|
||||||
|
routes are composed. A proper 404 handler is now in place.
|
||||||
|
- **MetricStore moved**: The `metricstore` package has been moved from `internal/`
|
||||||
|
to `pkg/` as it is now part of the public API.
|
||||||
|
- **MySQL/MariaDB support removed**: Only SQLite is now supported as the database backend.
|
||||||
|
- **Archive to Cleanup renaming**: Archive-related functions have been refactored
|
||||||
|
and renamed to "Cleanup" for clarity.
|
||||||
|
- **`minRunningFor` filter removed**: This undocumented filter has been removed
|
||||||
|
from the API and frontend.
|
||||||
|
|
||||||
|
### Dependency changes
|
||||||
|
|
||||||
|
- **cc-lib v2.5.1**: Switched to cc-lib version 2 with updated APIs (currently at v2.5.1)
|
||||||
|
- **cclib NATS client**: Now using the cclib NATS client implementation
|
||||||
|
- Removed obsolete `util.Float` usage from cclib
|
||||||
|
|
||||||
|
## Major new features
|
||||||
|
|
||||||
|
### NATS API Integration
|
||||||
|
|
||||||
|
- **Real-time job events**: Subscribe to job start/stop events via NATS
|
||||||
|
- **Node state updates**: Receive real-time node state changes via NATS
|
||||||
|
- **Configurable subjects**: NATS API subjects are now configurable via `api-subjects`
|
||||||
|
- **Deadlock fixes**: Improved NATS client stability and graceful shutdown
|
||||||
|
|
||||||
|
### Public Dashboard
|
||||||
|
|
||||||
|
- **Public-facing interface**: New public dashboard route for external users
|
||||||
|
- **DoubleMetricPlot component**: New visualization component for comparing metrics
|
||||||
|
- **Improved layout**: Reviewed and optimized dashboard layouts for better readability
|
||||||
|
|
||||||
|
### Enhanced Node Management
|
||||||
|
|
||||||
|
- **Node state tracking**: New node table in database with timestamp tracking
|
||||||
|
- **Node state filtering**: Filter jobs by node state in systems view
|
||||||
|
- **Node list enhancements**: Improved paging, filtering, and continuous scroll support
|
||||||
|
- **Nodestate retention and archiving**: Node state data is now subject to configurable
|
||||||
|
retention policies and can be archived to Parquet format for long-term storage
|
||||||
|
- **Faulty node metric tracking**: Faulty node state metric lists are persisted to the database
|
||||||
|
|
||||||
|
### Health Monitoring
|
||||||
|
|
||||||
|
- **Health status dashboard**: New dedicated "Health" tab in the status details view
|
||||||
|
showing per-node metric health across the cluster
|
||||||
|
- **CCMS health check**: Support for querying health status of external
|
||||||
|
cc-metric-store (CCMS) instances via the API
|
||||||
|
- **GraphQL health endpoints**: New GraphQL queries and resolvers for health data
|
||||||
|
- **Cluster/subcluster filter**: Filter health status view by cluster or subcluster
|
||||||
|
|
||||||
|
### Log Viewer
|
||||||
|
|
||||||
|
- **Web-based log viewer**: New log viewer page in the admin interface for inspecting
|
||||||
|
backend log output directly from the browser without shell access
|
||||||
|
- **Accessible from header**: Quick access link from the navigation header
|
||||||
|
|
||||||
|
### MetricStore Improvements
|
||||||
|
|
||||||
|
- **Memory tracking worker**: New worker for CCMS memory usage tracking
|
||||||
|
- **Dynamic retention**: Support for job specific dynamic retention times
|
||||||
|
- **Improved compression**: Transparent compression for job archive imports
|
||||||
|
- **Parallel processing**: Parallelized Iter function in all archive backends
|
||||||
|
|
||||||
|
### Job Tagging System
|
||||||
|
|
||||||
|
- **Job tagger option**: Enable automatic job tagging via configuration flag
|
||||||
|
- **Application detection**: Automatic detection of applications (MATLAB, GROMACS, etc.)
|
||||||
|
- **Job classification**: Automatic detection of pathological jobs
|
||||||
|
- **omit-tagged**: Option to exclude tagged jobs from retention/cleanup operations (`none`, `all`, or `user`)
|
||||||
|
- **Admin UI trigger**: Taggers can be run on-demand from the admin web interface
|
||||||
|
without restarting the backend
|
||||||
|
|
||||||
|
### Archive Backends
|
||||||
|
|
||||||
|
- **Parquet archive format**: New Parquet file format for job archiving, providing
|
||||||
|
columnar storage with efficient compression for analytical workloads
|
||||||
|
- **S3 backend**: Full support for S3-compatible object storage
|
||||||
|
- **SQLite backend**: Full support for SQLite backend using blobs
|
||||||
|
- **Performance improvements**: Fixed performance bugs in archive backends
|
||||||
|
- **Better error handling**: Improved error messages and fallback handling
|
||||||
|
- **Zstd compression**: Parquet writers use zstd compression for better
|
||||||
|
compression ratios compared to the previous snappy default
|
||||||
|
- **Optimized sort order**: Job and nodestate Parquet files are sorted by
|
||||||
|
cluster, subcluster, and start time for efficient range queries
|
||||||
|
|
||||||
|
### Unified Archive Retention and Format Conversion
|
||||||
|
|
||||||
|
- **Uniform retention policy**: Job archive retention now supports both JSON and
|
||||||
|
Parquet as target formats under a single, consistent policy configuration
|
||||||
|
- **Archive manager tool**: The `tools/archive-manager` utility now supports
|
||||||
|
format conversion between JSON and Parquet job archives
|
||||||
|
- **Parquet reader**: Full Parquet archive reader implementation for reading back
|
||||||
|
archived job data
|
||||||
|
|
||||||
|
## New features and improvements
|
||||||
|
|
||||||
|
### Frontend
|
||||||
|
|
||||||
|
- **Loading indicators**: Added loading indicators to status detail and job lists
|
||||||
|
- **Job info layout**: Reviewed and improved job info row layout
|
||||||
|
- **Metric selection**: Enhanced metric selection with drag-and-drop fixes
|
||||||
|
- **Filter presets**: Move list filter preset to URL for easy sharing
|
||||||
|
- **Job comparison**: Improved job comparison views and plots
|
||||||
|
- **Subcluster reactivity**: Job list now reacts to subcluster filter changes
|
||||||
|
- **Short jobs quick selection**: New "Short jobs" quick-filter button in job lists
|
||||||
|
replaces the removed undocumented `minRunningFor` filter
|
||||||
|
- **Row plot cursor sync**: Cursor position is now synchronized across all metric
|
||||||
|
plots in a job list row for easier cross-metric comparison
|
||||||
|
- **Disabled metrics handling**: Improved handling and display of disabled metrics
|
||||||
|
across job view, node view, and list rows
|
||||||
|
- **"Not configured" info cards**: Informational cards shown when optional features
|
||||||
|
are not yet configured
|
||||||
|
- **Frontend dependencies**: Bumped frontend dependencies to latest versions
|
||||||
|
- **Svelte 5 compatibility**: Fixed Svelte state warnings and compatibility issues
|
||||||
|
|
||||||
|
### Backend
|
||||||
|
|
||||||
|
- **Progress bars**: Import function now shows progress during long operations
|
||||||
|
- **Better logging**: Improved logging with appropriate log levels throughout
|
||||||
|
- **Graceful shutdown**: Fixed shutdown timeout bugs and hanging issues
|
||||||
|
- **Configuration defaults**: Sensible defaults for most configuration options
|
||||||
|
- **Documentation**: Extensive documentation improvements across packages
|
||||||
|
- **Server flag in systemd unit**: Example systemd unit now includes the `-server` flag
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- **LDAP security hardening**: Improved input validation, connection handling, and
|
||||||
|
error reporting in the LDAP authenticator
|
||||||
|
- **OIDC security hardening**: Stricter token validation and improved error handling
|
||||||
|
in the OIDC authenticator
|
||||||
|
- **Auth schema extensions**: Additional schema fields for improved auth configuration
|
||||||
|
|
||||||
|
### API improvements
|
||||||
|
|
||||||
|
- **Role-based metric visibility**: Metrics can now have role-based access control
|
||||||
|
- **Job exclusivity filter**: New filter for exclusive vs. shared jobs
|
||||||
|
- **Improved error messages**: Better error messages and documentation in REST API
|
||||||
|
- **GraphQL enhancements**: Improved GraphQL queries and resolvers
|
||||||
|
- **Stop job lookup order**: Reversed lookup order in stop job requests for
|
||||||
|
more reliable job matching (cluster+jobId first, then jobId alone)
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
- **Database indices**: Optimized SQLite indices for better query performance
|
||||||
|
- **Job cache**: Introduced caching table for faster job inserts
|
||||||
|
- **Parallel imports**: Archive imports now run in parallel where possible
|
||||||
|
- **External tool integration**: Optimized use of external tools (fd) for better performance
|
||||||
|
- **Node repository queries**: Reviewed and optimized node repository SQL queries
|
||||||
|
- **Buffer pool**: Resized and pooled internal buffers for better memory reuse
|
||||||
|
|
||||||
|
### Developer experience
|
||||||
|
|
||||||
|
- **AI agent guidelines**: Added documentation for AI coding agents (AGENTS.md, CLAUDE.md)
|
||||||
|
- **Example API payloads**: Added example JSON API payloads for testing
|
||||||
|
- **Unit tests**: Added more unit tests for NATS API, node repository, and other components
|
||||||
|
- **Test improvements**: Better test coverage; test DB is now copied before unit tests
|
||||||
|
to avoid state pollution between test runs
|
||||||
|
- **Parquet writer tests**: Comprehensive tests for Parquet archive writing and conversion
|
||||||
|
|
||||||
|
## Bug fixes
|
||||||
|
|
||||||
|
- Fixed nodelist paging issues
|
||||||
|
- Fixed metric select drag and drop functionality
|
||||||
|
- Fixed render race conditions in nodeList
|
||||||
|
- Fixed tag count grouping including type
|
||||||
|
- Fixed wrong metricstore schema (missing comma)
|
||||||
|
- Fixed configuration issues causing shutdown hangs
|
||||||
|
- Fixed deadlock when NATS is not configured
|
||||||
|
- Fixed archive backend performance bugs
|
||||||
|
- Fixed continuous scroll buildup on refresh
|
||||||
|
- Improved footprint calculation logic
|
||||||
|
- Fixed polar plot data query decoupling
|
||||||
|
- Fixed missing resolution parameter handling
|
||||||
|
- Fixed node table initialization fallback
|
||||||
|
- Fixed reactivity key placement in nodeList
|
||||||
|
- Fixed nodeList resolver data handling and increased nodestate filter cutoff
|
||||||
|
- Fixed job always being transferred to main job table before archiving
|
||||||
|
- Fixed AppTagger error handling and logging
|
||||||
|
- Fixed log endpoint formatting and correctness
|
||||||
|
- Fixed automatic refresh in metric status tab
|
||||||
|
- Fixed NULL value handling in `health_state` and `health_metrics` columns
|
||||||
|
- Fixed bugs related to `job_cache` IDs being used in the main job table
|
||||||
|
- Fixed SyncJobs bug causing start job hooks to be called with wrong (cache) IDs
|
||||||
|
- Fixed 404 handler route for sub-routers
|
||||||
|
|
||||||
|
## Configuration changes
|
||||||
|
|
||||||
|
### New configuration options
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"main": {
|
||||||
|
"enable-job-taggers": true,
|
||||||
|
"resampling": {
|
||||||
|
"minimum-points": 600,
|
||||||
|
"trigger": 180,
|
||||||
|
"resolutions": [240, 60]
|
||||||
|
},
|
||||||
|
"api-subjects": {
|
||||||
|
"subject-job-event": "cc.job.event",
|
||||||
|
"subject-node-state": "cc.node.state"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nats": {
|
||||||
|
"address": "nats://0.0.0.0:4222",
|
||||||
|
"username": "root",
|
||||||
|
"password": "root"
|
||||||
|
},
|
||||||
|
"cron": {
|
||||||
|
"commit-job-worker": "1m",
|
||||||
|
"duration-worker": "5m",
|
||||||
|
"footprint-worker": "10m"
|
||||||
|
},
|
||||||
|
"metric-store": {
|
||||||
|
"cleanup": {
|
||||||
|
"mode": "archive",
|
||||||
|
"interval": "48h",
|
||||||
|
"directory": "./var/archive"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"archive": {
|
||||||
|
"retention": {
|
||||||
|
"policy": "delete",
|
||||||
|
"age": "6months",
|
||||||
|
"target-format": "parquet"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nodestate": {
|
||||||
|
"retention": {
|
||||||
|
"policy": "archive",
|
||||||
|
"age": "30d",
|
||||||
|
"archive-path": "./var/nodestate-archive"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Migration notes
|
||||||
|
|
||||||
|
- Review and update your `config.json` to use kebab-case attribute names
|
||||||
|
- If using NATS, configure the new `nats` and `api-subjects` sections
|
||||||
|
- If using S3 archive backend, configure the new `archive` section options
|
||||||
|
- Test the new public dashboard at `/public` route
|
||||||
|
- Review cron worker configuration if you need different frequencies
|
||||||
|
- If using the archive retention feature, configure the `target-format` option
|
||||||
|
to choose between `json` (default) and `parquet` output formats
|
||||||
|
- Consider enabling nodestate retention if you track node states over time
|
||||||
|
|
||||||
|
## Known issues
|
||||||
|
|
||||||
|
- The new dynamic memory management is not bullet proof yet across restarts. We
|
||||||
|
will fix that in a subsequent patch release
|
||||||
|
- Currently energy footprint metrics of type energy are ignored for calculating
|
||||||
|
total energy.
|
||||||
|
- With energy footprint metrics of type power the unit is ignored and it is
|
||||||
|
assumed the metric has the unit Watt.
|
||||||
|
|||||||
@@ -4,138 +4,223 @@ scalar Any
|
|||||||
scalar NullableFloat
|
scalar NullableFloat
|
||||||
scalar MetricScope
|
scalar MetricScope
|
||||||
scalar JobState
|
scalar JobState
|
||||||
|
scalar SchedulerState
|
||||||
|
scalar MonitoringState
|
||||||
|
|
||||||
|
type Node {
|
||||||
|
id: ID!
|
||||||
|
hostname: String!
|
||||||
|
cluster: String!
|
||||||
|
subCluster: String!
|
||||||
|
jobsRunning: Int!
|
||||||
|
cpusAllocated: Int
|
||||||
|
memoryAllocated: Int
|
||||||
|
gpusAllocated: Int
|
||||||
|
schedulerState: SchedulerState!
|
||||||
|
healthState: MonitoringState!
|
||||||
|
metaData: Any
|
||||||
|
healthData: Any
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStates {
|
||||||
|
state: String!
|
||||||
|
count: Int!
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStatesTimed {
|
||||||
|
state: String!
|
||||||
|
counts: [Int!]!
|
||||||
|
times: [Int!]!
|
||||||
|
}
|
||||||
|
|
||||||
type Job {
|
type Job {
|
||||||
id: ID!
|
id: ID!
|
||||||
jobId: Int!
|
jobId: Int!
|
||||||
user: String!
|
user: String!
|
||||||
project: String!
|
project: String!
|
||||||
cluster: String!
|
cluster: String!
|
||||||
subCluster: String!
|
subCluster: String!
|
||||||
startTime: Time!
|
startTime: Time!
|
||||||
duration: Int!
|
duration: Int!
|
||||||
walltime: Int!
|
walltime: Int!
|
||||||
numNodes: Int!
|
numNodes: Int!
|
||||||
numHWThreads: Int!
|
numHWThreads: Int!
|
||||||
numAcc: Int!
|
numAcc: Int!
|
||||||
SMT: Int!
|
energy: Float!
|
||||||
exclusive: Int!
|
SMT: Int!
|
||||||
partition: String!
|
shared: String!
|
||||||
arrayJobId: Int!
|
partition: String!
|
||||||
|
arrayJobId: Int!
|
||||||
monitoringStatus: Int!
|
monitoringStatus: Int!
|
||||||
state: JobState!
|
state: JobState!
|
||||||
tags: [Tag!]!
|
tags: [Tag!]!
|
||||||
resources: [Resource!]!
|
resources: [Resource!]!
|
||||||
concurrentJobs: JobLinkResultList
|
concurrentJobs: JobLinkResultList
|
||||||
|
footprint: [FootprintValue]
|
||||||
memUsedMax: Float
|
energyFootprint: [EnergyFootprintValue]
|
||||||
flopsAnyAvg: Float
|
metaData: Any
|
||||||
memBwAvg: Float
|
userData: User
|
||||||
loadAvg: Float
|
|
||||||
|
|
||||||
metaData: Any
|
|
||||||
userData: User
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type JobLink {
|
type JobLink {
|
||||||
id: ID!
|
id: ID!
|
||||||
jobId: Int!
|
jobId: Int!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Cluster {
|
type Cluster {
|
||||||
name: String!
|
name: String!
|
||||||
partitions: [String!]! # Slurm partitions
|
partitions: [String!]! # Slurm partitions
|
||||||
metricConfig: [MetricConfig!]!
|
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type SubCluster {
|
type SubCluster {
|
||||||
name: String!
|
name: String!
|
||||||
nodes: String!
|
nodes: String!
|
||||||
numberOfNodes: Int!
|
numberOfNodes: Int!
|
||||||
processorType: String!
|
processorType: String!
|
||||||
socketsPerNode: Int!
|
socketsPerNode: Int!
|
||||||
coresPerSocket: Int!
|
coresPerSocket: Int!
|
||||||
threadsPerCore: Int!
|
threadsPerCore: Int!
|
||||||
flopRateScalar: MetricValue!
|
flopRateScalar: MetricValue!
|
||||||
flopRateSimd: MetricValue!
|
flopRateSimd: MetricValue!
|
||||||
memoryBandwidth: MetricValue!
|
memoryBandwidth: MetricValue!
|
||||||
topology: Topology!
|
topology: Topology!
|
||||||
|
metricConfig: [MetricConfig!]!
|
||||||
|
footprint: [String!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type FootprintValue {
|
||||||
|
name: String!
|
||||||
|
stat: String!
|
||||||
|
value: Float!
|
||||||
|
}
|
||||||
|
|
||||||
|
type EnergyFootprintValue {
|
||||||
|
hardware: String!
|
||||||
|
metric: String!
|
||||||
|
value: Float!
|
||||||
}
|
}
|
||||||
|
|
||||||
type MetricValue {
|
type MetricValue {
|
||||||
|
name: String
|
||||||
unit: Unit!
|
unit: Unit!
|
||||||
value: Float!
|
value: Float!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Topology {
|
type Topology {
|
||||||
node: [Int!]
|
node: [Int!]
|
||||||
socket: [[Int!]!]
|
socket: [[Int!]!]
|
||||||
memoryDomain: [[Int!]!]
|
memoryDomain: [[Int!]!]
|
||||||
die: [[Int!]!]
|
die: [[Int!]!]
|
||||||
core: [[Int!]!]
|
core: [[Int!]!]
|
||||||
accelerators: [Accelerator!]
|
accelerators: [Accelerator!]
|
||||||
}
|
}
|
||||||
|
|
||||||
type Accelerator {
|
type Accelerator {
|
||||||
id: String!
|
id: String!
|
||||||
type: String!
|
type: String!
|
||||||
model: String!
|
model: String!
|
||||||
}
|
}
|
||||||
|
|
||||||
type SubClusterConfig {
|
type SubClusterConfig {
|
||||||
name: String!
|
name: String!
|
||||||
peak: Float
|
peak: Float
|
||||||
normal: Float
|
normal: Float
|
||||||
caution: Float
|
caution: Float
|
||||||
alert: Float
|
alert: Float
|
||||||
remove: Boolean
|
remove: Boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
type MetricConfig {
|
type MetricConfig {
|
||||||
name: String!
|
name: String!
|
||||||
unit: Unit!
|
unit: Unit!
|
||||||
scope: MetricScope!
|
scope: MetricScope!
|
||||||
aggregation: String!
|
aggregation: String!
|
||||||
timestep: Int!
|
timestep: Int!
|
||||||
peak: Float!
|
peak: Float!
|
||||||
normal: Float
|
normal: Float
|
||||||
caution: Float!
|
caution: Float!
|
||||||
alert: Float!
|
alert: Float!
|
||||||
|
lowerIsBetter: Boolean
|
||||||
subClusters: [SubClusterConfig!]!
|
subClusters: [SubClusterConfig!]!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Tag {
|
type Tag {
|
||||||
id: ID!
|
id: ID!
|
||||||
type: String!
|
type: String!
|
||||||
name: String!
|
name: String!
|
||||||
|
scope: String!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Resource {
|
type Resource {
|
||||||
hostname: String!
|
hostname: String!
|
||||||
hwthreads: [Int!]
|
hwthreads: [Int!]
|
||||||
accelerators: [String!]
|
accelerators: [String!]
|
||||||
configuration: String
|
configuration: String
|
||||||
}
|
}
|
||||||
|
|
||||||
type JobMetricWithName {
|
type JobMetricWithName {
|
||||||
name: String!
|
name: String!
|
||||||
scope: MetricScope!
|
scope: MetricScope!
|
||||||
metric: JobMetric!
|
metric: JobMetric!
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ClusterMetricWithName {
|
||||||
|
name: String!
|
||||||
|
unit: Unit
|
||||||
|
timestep: Int!
|
||||||
|
data: [NullableFloat!]!
|
||||||
|
}
|
||||||
|
|
||||||
type JobMetric {
|
type JobMetric {
|
||||||
unit: Unit
|
unit: Unit
|
||||||
timestep: Int!
|
timestep: Int!
|
||||||
series: [Series!]
|
series: [Series!]
|
||||||
statisticsSeries: StatsSeries
|
statisticsSeries: StatsSeries
|
||||||
}
|
}
|
||||||
|
|
||||||
type Series {
|
type Series {
|
||||||
hostname: String!
|
hostname: String!
|
||||||
id: String
|
id: String
|
||||||
statistics: MetricStatistics
|
statistics: MetricStatistics
|
||||||
data: [NullableFloat!]!
|
data: [NullableFloat!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type StatsSeries {
|
||||||
|
mean: [NullableFloat!]!
|
||||||
|
median: [NullableFloat!]!
|
||||||
|
min: [NullableFloat!]!
|
||||||
|
max: [NullableFloat!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type NamedStatsWithScope {
|
||||||
|
name: String!
|
||||||
|
scope: MetricScope!
|
||||||
|
stats: [ScopedStats!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type ScopedStats {
|
||||||
|
hostname: String!
|
||||||
|
id: String
|
||||||
|
data: MetricStatistics!
|
||||||
|
}
|
||||||
|
|
||||||
|
type JobStats {
|
||||||
|
id: Int!
|
||||||
|
jobId: String!
|
||||||
|
startTime: Int!
|
||||||
|
duration: Int!
|
||||||
|
cluster: String!
|
||||||
|
subCluster: String!
|
||||||
|
numNodes: Int!
|
||||||
|
numHWThreads: Int
|
||||||
|
numAccelerators: Int
|
||||||
|
stats: [NamedStats!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type NamedStats {
|
||||||
|
name: String!
|
||||||
|
data: MetricStatistics!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Unit {
|
type Unit {
|
||||||
@@ -149,20 +234,14 @@ type MetricStatistics {
|
|||||||
max: Float!
|
max: Float!
|
||||||
}
|
}
|
||||||
|
|
||||||
type StatsSeries {
|
|
||||||
mean: [NullableFloat!]!
|
|
||||||
min: [NullableFloat!]!
|
|
||||||
max: [NullableFloat!]!
|
|
||||||
}
|
|
||||||
|
|
||||||
type MetricFootprints {
|
type MetricFootprints {
|
||||||
metric: String!
|
metric: String!
|
||||||
data: [NullableFloat!]!
|
data: [NullableFloat!]!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Footprints {
|
type Footprints {
|
||||||
timeWeights: TimeWeights!
|
timeWeights: TimeWeights!
|
||||||
metrics: [MetricFootprints!]!
|
metrics: [MetricFootprints!]!
|
||||||
}
|
}
|
||||||
|
|
||||||
type TimeWeights {
|
type TimeWeights {
|
||||||
@@ -171,87 +250,224 @@ type TimeWeights {
|
|||||||
coreHours: [NullableFloat!]!
|
coreHours: [NullableFloat!]!
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Aggregate { USER, PROJECT, CLUSTER }
|
enum Aggregate {
|
||||||
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS }
|
USER
|
||||||
|
PROJECT
|
||||||
|
CLUSTER
|
||||||
|
SUBCLUSTER
|
||||||
|
}
|
||||||
|
enum SortByAggregate {
|
||||||
|
TOTALWALLTIME
|
||||||
|
TOTALJOBS
|
||||||
|
TOTALUSERS
|
||||||
|
TOTALNODES
|
||||||
|
TOTALNODEHOURS
|
||||||
|
TOTALCORES
|
||||||
|
TOTALCOREHOURS
|
||||||
|
TOTALACCS
|
||||||
|
TOTALACCHOURS
|
||||||
|
}
|
||||||
|
|
||||||
type NodeMetrics {
|
type NodeMetrics {
|
||||||
host: String!
|
host: String!
|
||||||
|
state: String!
|
||||||
subCluster: String!
|
subCluster: String!
|
||||||
metrics: [JobMetricWithName!]!
|
metrics: [JobMetricWithName!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type ClusterMetrics {
|
||||||
|
nodeCount: Int!
|
||||||
|
metrics: [ClusterMetricWithName!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodesResultList {
|
||||||
|
items: [NodeMetrics!]!
|
||||||
|
offset: Int
|
||||||
|
limit: Int
|
||||||
|
count: Int
|
||||||
|
totalNodes: Int
|
||||||
|
hasNextPage: Boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
type ClusterSupport {
|
||||||
|
cluster: String!
|
||||||
|
subClusters: [String!]!
|
||||||
|
}
|
||||||
|
|
||||||
|
type GlobalMetricListItem {
|
||||||
|
name: String!
|
||||||
|
unit: Unit!
|
||||||
|
scope: MetricScope!
|
||||||
|
footprint: String
|
||||||
|
availability: [ClusterSupport!]!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Count {
|
type Count {
|
||||||
name: String!
|
name: String!
|
||||||
count: Int!
|
count: Int!
|
||||||
}
|
}
|
||||||
|
|
||||||
type User {
|
type User {
|
||||||
username: String!
|
username: String!
|
||||||
name: String!
|
name: String!
|
||||||
email: String!
|
email: String!
|
||||||
|
}
|
||||||
|
|
||||||
|
input MetricStatItem {
|
||||||
|
metricName: String!
|
||||||
|
range: FloatRange!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Query {
|
type Query {
|
||||||
clusters: [Cluster!]! # List of all clusters
|
clusters: [Cluster!]! # List of all clusters
|
||||||
tags: [Tag!]! # List of all tags
|
tags: [Tag!]! # List of all tags
|
||||||
|
globalMetrics: [GlobalMetricListItem!]!
|
||||||
|
|
||||||
user(username: String!): User
|
user(username: String!): User
|
||||||
allocatedNodes(cluster: String!): [Count!]!
|
allocatedNodes(cluster: String!): [Count!]!
|
||||||
|
|
||||||
|
## Node Queries New
|
||||||
|
node(id: ID!): Node
|
||||||
|
nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
|
||||||
|
nodesWithMeta(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
|
||||||
|
nodeStates(filter: [NodeFilter!]): [NodeStates!]!
|
||||||
|
nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]!
|
||||||
|
|
||||||
job(id: ID!): Job
|
job(id: ID!): Job
|
||||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
jobMetrics(
|
||||||
|
id: ID!
|
||||||
|
metrics: [String!]
|
||||||
|
scopes: [MetricScope!]
|
||||||
|
resolution: Int
|
||||||
|
): [JobMetricWithName!]!
|
||||||
|
|
||||||
|
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
|
||||||
|
|
||||||
|
scopedJobStats(
|
||||||
|
id: ID!
|
||||||
|
metrics: [String!]
|
||||||
|
scopes: [MetricScope!]
|
||||||
|
): [NamedStatsWithScope!]!
|
||||||
|
|
||||||
|
jobs(
|
||||||
|
filter: [JobFilter!]
|
||||||
|
page: PageRequest
|
||||||
|
order: OrderByInput
|
||||||
|
): JobResultList!
|
||||||
|
|
||||||
|
jobsStatistics(
|
||||||
|
filter: [JobFilter!]
|
||||||
|
metrics: [String!]
|
||||||
|
page: PageRequest
|
||||||
|
sortBy: SortByAggregate
|
||||||
|
groupBy: Aggregate
|
||||||
|
numDurationBins: String
|
||||||
|
numMetricBins: Int
|
||||||
|
): [JobsStatistics!]!
|
||||||
|
|
||||||
|
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
|
||||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||||
|
|
||||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
rooflineHeatmap(
|
||||||
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
|
filter: [JobFilter!]!
|
||||||
|
rows: Int!
|
||||||
|
cols: Int!
|
||||||
|
minX: Float!
|
||||||
|
minY: Float!
|
||||||
|
maxX: Float!
|
||||||
|
maxY: Float!
|
||||||
|
): [[Float!]!]!
|
||||||
|
|
||||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
nodeMetrics(
|
||||||
|
cluster: String!
|
||||||
|
nodes: [String!]
|
||||||
|
scopes: [MetricScope!]
|
||||||
|
metrics: [String!]
|
||||||
|
from: Time!
|
||||||
|
to: Time!
|
||||||
|
): [NodeMetrics!]!
|
||||||
|
|
||||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
nodeMetricsList(
|
||||||
|
cluster: String!
|
||||||
|
subCluster: String!
|
||||||
|
stateFilter: String!
|
||||||
|
nodeFilter: String!
|
||||||
|
scopes: [MetricScope!]
|
||||||
|
metrics: [String!]
|
||||||
|
from: Time!
|
||||||
|
to: Time!
|
||||||
|
page: PageRequest
|
||||||
|
resolution: Int
|
||||||
|
): NodesResultList!
|
||||||
|
|
||||||
|
clusterMetrics(
|
||||||
|
cluster: String!
|
||||||
|
metrics: [String!]
|
||||||
|
from: Time!
|
||||||
|
to: Time!
|
||||||
|
): ClusterMetrics!
|
||||||
}
|
}
|
||||||
|
|
||||||
type Mutation {
|
type Mutation {
|
||||||
createTag(type: String!, name: String!): Tag!
|
createTag(type: String!, name: String!, scope: String!): Tag!
|
||||||
deleteTag(id: ID!): ID!
|
deleteTag(id: ID!): ID!
|
||||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||||
|
removeTagFromList(tagIds: [ID!]!): [Int!]!
|
||||||
|
|
||||||
updateConfiguration(name: String!, value: String!): String
|
updateConfiguration(name: String!, value: String!): String
|
||||||
}
|
}
|
||||||
|
|
||||||
type IntRangeOutput { from: Int!, to: Int! }
|
type IntRangeOutput {
|
||||||
type TimeRangeOutput { from: Time!, to: Time! }
|
from: Int!
|
||||||
|
to: Int!
|
||||||
|
}
|
||||||
|
type TimeRangeOutput {
|
||||||
|
range: String
|
||||||
|
from: Time!
|
||||||
|
to: Time!
|
||||||
|
}
|
||||||
|
|
||||||
|
input NodeFilter {
|
||||||
|
hostname: StringInput
|
||||||
|
cluster: StringInput
|
||||||
|
subCluster: StringInput
|
||||||
|
schedulerState: SchedulerState
|
||||||
|
healthState: MonitoringState
|
||||||
|
timeStart: Int
|
||||||
|
}
|
||||||
|
|
||||||
input JobFilter {
|
input JobFilter {
|
||||||
tags: [ID!]
|
tags: [ID!]
|
||||||
jobId: StringInput
|
dbId: [ID!]
|
||||||
arrayJobId: Int
|
jobId: StringInput
|
||||||
user: StringInput
|
arrayJobId: Int
|
||||||
project: StringInput
|
user: StringInput
|
||||||
jobName: StringInput
|
project: StringInput
|
||||||
cluster: StringInput
|
jobName: StringInput
|
||||||
partition: StringInput
|
cluster: StringInput
|
||||||
duration: IntRange
|
subCluster: StringInput
|
||||||
|
partition: StringInput
|
||||||
|
duration: IntRange
|
||||||
|
energy: FloatRange
|
||||||
|
|
||||||
minRunningFor: Int
|
minRunningFor: Int
|
||||||
|
|
||||||
numNodes: IntRange
|
numNodes: IntRange
|
||||||
numAccelerators: IntRange
|
numAccelerators: IntRange
|
||||||
numHWThreads: IntRange
|
numHWThreads: IntRange
|
||||||
|
|
||||||
startTime: TimeRange
|
startTime: TimeRange
|
||||||
state: [JobState!]
|
state: [JobState!]
|
||||||
flopsAnyAvg: FloatRange
|
metricStats: [MetricStatItem!]
|
||||||
memBwAvg: FloatRange
|
shared: String
|
||||||
loadAvg: FloatRange
|
schedule: String
|
||||||
memUsedMax: FloatRange
|
node: StringInput
|
||||||
|
|
||||||
exclusive: Int
|
|
||||||
node: StringInput
|
|
||||||
}
|
}
|
||||||
|
|
||||||
input OrderByInput {
|
input OrderByInput {
|
||||||
field: String!
|
field: String!
|
||||||
|
type: String!
|
||||||
order: SortDirectionEnum! = ASC
|
order: SortDirectionEnum! = ASC
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,30 +477,46 @@ enum SortDirectionEnum {
|
|||||||
}
|
}
|
||||||
|
|
||||||
input StringInput {
|
input StringInput {
|
||||||
eq: String
|
eq: String
|
||||||
neq: String
|
neq: String
|
||||||
contains: String
|
contains: String
|
||||||
startsWith: String
|
startsWith: String
|
||||||
endsWith: String
|
endsWith: String
|
||||||
in: [String!]
|
in: [String!]
|
||||||
}
|
}
|
||||||
|
|
||||||
input IntRange { from: Int!, to: Int! }
|
input IntRange {
|
||||||
input FloatRange { from: Float!, to: Float! }
|
from: Int!
|
||||||
input TimeRange { from: Time, to: Time }
|
to: Int!
|
||||||
|
}
|
||||||
|
input TimeRange {
|
||||||
|
range: String
|
||||||
|
from: Time
|
||||||
|
to: Time
|
||||||
|
}
|
||||||
|
|
||||||
|
input FloatRange {
|
||||||
|
from: Float!
|
||||||
|
to: Float!
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStateResultList {
|
||||||
|
items: [Node!]!
|
||||||
|
count: Int
|
||||||
|
}
|
||||||
|
|
||||||
type JobResultList {
|
type JobResultList {
|
||||||
items: [Job!]!
|
items: [Job!]!
|
||||||
offset: Int
|
offset: Int
|
||||||
limit: Int
|
limit: Int
|
||||||
count: Int
|
count: Int
|
||||||
hasNextPage: Boolean
|
hasNextPage: Boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
type JobLinkResultList {
|
type JobLinkResultList {
|
||||||
listQuery: String
|
listQuery: String
|
||||||
items: [JobLink!]!
|
items: [JobLink!]!
|
||||||
count: Int
|
count: Int
|
||||||
}
|
}
|
||||||
|
|
||||||
type HistoPoint {
|
type HistoPoint {
|
||||||
@@ -295,6 +527,7 @@ type HistoPoint {
|
|||||||
type MetricHistoPoints {
|
type MetricHistoPoints {
|
||||||
metric: String!
|
metric: String!
|
||||||
unit: String!
|
unit: String!
|
||||||
|
stat: String
|
||||||
data: [MetricHistoPoint!]
|
data: [MetricHistoPoint!]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -305,27 +538,28 @@ type MetricHistoPoint {
|
|||||||
max: Int
|
max: Int
|
||||||
}
|
}
|
||||||
|
|
||||||
type JobsStatistics {
|
type JobsStatistics {
|
||||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster
|
id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster
|
||||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||||
totalJobs: Int! # Number of jobs
|
totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs)
|
||||||
runningJobs: Int! # Number of running jobs
|
totalJobs: Int! # Number of jobs
|
||||||
shortJobs: Int! # Number of jobs with a duration of less than duration
|
runningJobs: Int! # Number of running jobs
|
||||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
shortJobs: Int! # Number of jobs with a duration of less than config'd ShortRunningJobsDuration
|
||||||
totalNodes: Int! # Sum of the nodes of all matched jobs
|
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
totalNodes: Int! # Sum of the nodes of all matched jobs
|
||||||
totalCores: Int! # Sum of the cores of all matched jobs
|
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
totalCores: Int! # Sum of the cores of all matched jobs
|
||||||
totalAccs: Int! # Sum of the accs of all matched jobs
|
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
totalAccs: Int! # Sum of the accs of all matched jobs
|
||||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||||
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||||
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
||||||
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
||||||
|
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
||||||
}
|
}
|
||||||
|
|
||||||
input PageRequest {
|
input PageRequest {
|
||||||
itemsPerPage: Int!
|
itemsPerPage: Int!
|
||||||
page: Int!
|
page: Int!
|
||||||
}
|
}
|
||||||
|
|||||||
1795
api/swagger.json
1795
api/swagger.json
File diff suppressed because it is too large
Load Diff
1258
api/swagger.yaml
1258
api/swagger.yaml
File diff suppressed because it is too large
Load Diff
38
cmd/cc-backend/cli.go
Normal file
38
cmd/cc-backend/cli.go
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||||
|
// This file defines all command-line flags and their default values.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import "flag"
|
||||||
|
|
||||||
|
var (
|
||||||
|
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
|
||||||
|
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool
|
||||||
|
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||||
|
)
|
||||||
|
|
||||||
|
func cliInit() {
|
||||||
|
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
|
||||||
|
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
|
||||||
|
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
|
||||||
|
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
|
||||||
|
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
|
||||||
|
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
|
||||||
|
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
|
||||||
|
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
|
||||||
|
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
|
||||||
|
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
|
||||||
|
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||||
|
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||||
|
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||||
|
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
|
||||||
|
flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>")
|
||||||
|
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
|
||||||
|
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
|
||||||
|
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`")
|
||||||
|
flag.Parse()
|
||||||
|
}
|
||||||
94
cmd/cc-backend/init.go
Normal file
94
cmd/cc-backend/init.go
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||||
|
// This file contains bootstrap logic for initializing the environment,
|
||||||
|
// creating default configuration files, and setting up the database.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||||
|
)
|
||||||
|
|
||||||
|
const envString = `
|
||||||
|
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||||
|
# You can generate your own keypair using the gen-keypair tool
|
||||||
|
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||||
|
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||||
|
|
||||||
|
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||||
|
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||||
|
`
|
||||||
|
|
||||||
|
const configString = `
|
||||||
|
{
|
||||||
|
"main": {
|
||||||
|
"addr": "127.0.0.1:8080",
|
||||||
|
"short-running-jobs-duration": 300,
|
||||||
|
"resampling": {
|
||||||
|
"minimum-points": 600,
|
||||||
|
"trigger": 300,
|
||||||
|
"resolutions": [
|
||||||
|
240,
|
||||||
|
60
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"api-allowed-ips": [
|
||||||
|
"*"
|
||||||
|
],
|
||||||
|
"emission-constant": 317
|
||||||
|
},
|
||||||
|
"cron": {
|
||||||
|
"commit-job-worker": "1m",
|
||||||
|
"duration-worker": "5m",
|
||||||
|
"footprint-worker": "10m"
|
||||||
|
},
|
||||||
|
"archive": {
|
||||||
|
"kind": "file",
|
||||||
|
"path": "./var/job-archive"
|
||||||
|
},
|
||||||
|
"auth": {
|
||||||
|
"jwts": {
|
||||||
|
"max-age": "2000h"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`
|
||||||
|
|
||||||
|
func initEnv() {
|
||||||
|
if util.CheckFileExists("var") {
|
||||||
|
cclog.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||||
|
cclog.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||||
|
cclog.Abortf("Could not write default ./.env file with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.Mkdir("var", 0o777); err != nil {
|
||||||
|
cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
err := repository.MigrateDB("./var/job.db")
|
||||||
|
if err != nil {
|
||||||
|
cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
if err := os.Mkdir("var/job-archive", 0o777); err != nil {
|
||||||
|
cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
archiveCfg := "{\"kind\": \"file\",\"path\": \"./var/job-archive\"}"
|
||||||
|
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||||
|
cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
437
cmd/cc-backend/server.go
Normal file
437
cmd/cc-backend/server.go
Normal file
@@ -0,0 +1,437 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||||
|
// This file contains HTTP server setup, routing configuration, and
|
||||||
|
// authentication middleware integration.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto/tls"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/99designs/gqlgen/graphql/handler"
|
||||||
|
"github.com/99designs/gqlgen/graphql/handler/transport"
|
||||||
|
"github.com/99designs/gqlgen/graphql/playground"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/web"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/nats"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/runtime"
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
"github.com/go-chi/chi/v5/middleware"
|
||||||
|
"github.com/go-chi/cors"
|
||||||
|
httpSwagger "github.com/swaggo/http-swagger"
|
||||||
|
)
|
||||||
|
|
||||||
|
var buildInfo web.Build
|
||||||
|
|
||||||
|
// Environment variable names
|
||||||
|
const (
|
||||||
|
envDebug = "DEBUG"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Server encapsulates the HTTP server state and dependencies
|
||||||
|
type Server struct {
|
||||||
|
router chi.Router
|
||||||
|
server *http.Server
|
||||||
|
restAPIHandle *api.RestAPI
|
||||||
|
natsAPIHandle *api.NatsAPI
|
||||||
|
}
|
||||||
|
|
||||||
|
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
|
||||||
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
|
rw.WriteHeader(http.StatusUnauthorized)
|
||||||
|
json.NewEncoder(rw).Encode(map[string]string{
|
||||||
|
"status": http.StatusText(http.StatusUnauthorized),
|
||||||
|
"error": err.Error(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewServer creates and initializes a new Server instance
|
||||||
|
func NewServer(version, commit, buildDate string) (*Server, error) {
|
||||||
|
buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate}
|
||||||
|
|
||||||
|
s := &Server{
|
||||||
|
router: chi.NewRouter(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := s.init(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Server) init() error {
|
||||||
|
// Setup the http.Handler/Router used by the server
|
||||||
|
graph.Init()
|
||||||
|
resolver := graph.GetResolverInstance()
|
||||||
|
graphQLServer := handler.New(
|
||||||
|
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||||
|
|
||||||
|
graphQLServer.AddTransport(transport.POST{})
|
||||||
|
|
||||||
|
if os.Getenv(envDebug) != "1" {
|
||||||
|
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
|
||||||
|
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||||
|
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
|
||||||
|
switch e := err.(type) {
|
||||||
|
case string:
|
||||||
|
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||||
|
case error:
|
||||||
|
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
return errors.New("MAIN > Internal server error (panic)")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
authHandle := auth.GetAuthInstance()
|
||||||
|
|
||||||
|
// Middleware must be defined before routes in chi
|
||||||
|
s.router.Use(func(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
start := time.Now()
|
||||||
|
ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor)
|
||||||
|
next.ServeHTTP(ww, r)
|
||||||
|
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||||
|
r.Method, r.URL.RequestURI(),
|
||||||
|
ww.Status(), float32(ww.BytesWritten())/1024,
|
||||||
|
time.Since(start).Milliseconds())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
s.router.Use(middleware.Compress(5))
|
||||||
|
s.router.Use(middleware.Recoverer)
|
||||||
|
s.router.Use(cors.Handler(cors.Options{
|
||||||
|
AllowCredentials: true,
|
||||||
|
AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"},
|
||||||
|
AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"},
|
||||||
|
AllowedOrigins: []string{"*"},
|
||||||
|
}))
|
||||||
|
|
||||||
|
s.restAPIHandle = api.New()
|
||||||
|
|
||||||
|
info := map[string]any{}
|
||||||
|
info["hasOpenIDConnect"] = false
|
||||||
|
|
||||||
|
if auth.Keys.OpenIDConfig != nil {
|
||||||
|
openIDConnect := auth.NewOIDC(authHandle)
|
||||||
|
openIDConnect.RegisterEndpoints(s.router)
|
||||||
|
info["hasOpenIDConnect"] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||||
|
cclog.Debugf("##%v##", info)
|
||||||
|
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||||
|
})
|
||||||
|
s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||||
|
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||||
|
})
|
||||||
|
s.router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||||
|
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||||
|
})
|
||||||
|
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
// Create login failure handler (used by both /login and /jwt-login)
|
||||||
|
loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||||
|
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||||
|
rw.WriteHeader(http.StatusUnauthorized)
|
||||||
|
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||||
|
Title: "Login failed - ClusterCockpit",
|
||||||
|
MsgType: "alert-warning",
|
||||||
|
Message: err.Error(),
|
||||||
|
Build: buildInfo,
|
||||||
|
Infos: info,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP)
|
||||||
|
s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP)
|
||||||
|
|
||||||
|
s.router.Post("/logout", authHandle.Logout(
|
||||||
|
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||||
|
rw.WriteHeader(http.StatusOK)
|
||||||
|
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||||
|
Title: "Bye - ClusterCockpit",
|
||||||
|
MsgType: "alert-info",
|
||||||
|
Message: "Logout successful",
|
||||||
|
Build: buildInfo,
|
||||||
|
Infos: info,
|
||||||
|
})
|
||||||
|
})).ServeHTTP)
|
||||||
|
}
|
||||||
|
|
||||||
|
if flagDev {
|
||||||
|
s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||||
|
s.router.Get("/swagger/*", httpSwagger.Handler(
|
||||||
|
httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json")))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Secured routes (require authentication)
|
||||||
|
s.router.Group(func(secured chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
secured.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.Auth(
|
||||||
|
next,
|
||||||
|
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||||
|
rw.WriteHeader(http.StatusUnauthorized)
|
||||||
|
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||||
|
Title: "Authentication failed - ClusterCockpit",
|
||||||
|
MsgType: "alert-danger",
|
||||||
|
Message: err.Error(),
|
||||||
|
Build: buildInfo,
|
||||||
|
Infos: info,
|
||||||
|
Redirect: r.RequestURI,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
secured.Handle("/query", graphQLServer)
|
||||||
|
|
||||||
|
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||||
|
})
|
||||||
|
|
||||||
|
routerConfig.SetupRoutes(secured, buildInfo)
|
||||||
|
})
|
||||||
|
|
||||||
|
// API routes (JWT token auth)
|
||||||
|
s.router.Route("/api", func(apiRouter chi.Router) {
|
||||||
|
// Main API routes with API auth
|
||||||
|
apiRouter.Group(func(securedapi chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
securedapi.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.AuthAPI(next, onFailureResponse)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
s.restAPIHandle.MountAPIRoutes(securedapi)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Metric store API routes with separate auth
|
||||||
|
apiRouter.Group(func(metricstoreapi chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
metricstoreapi.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.AuthMetricStoreAPI(next, onFailureResponse)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
// User API routes
|
||||||
|
s.router.Route("/userapi", func(userapi chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
userapi.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.AuthUserAPI(next, onFailureResponse)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
s.restAPIHandle.MountUserAPIRoutes(userapi)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Config API routes (uses Group with full paths to avoid shadowing
|
||||||
|
// the /config page route that is registered in the secured group)
|
||||||
|
s.router.Group(func(configapi chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
configapi.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.AuthConfigAPI(next, onFailureResponse)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
s.restAPIHandle.MountConfigAPIRoutes(configapi)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Frontend API routes
|
||||||
|
s.router.Route("/frontend", func(frontendapi chi.Router) {
|
||||||
|
if !config.Keys.DisableAuthentication {
|
||||||
|
frontendapi.Use(func(next http.Handler) http.Handler {
|
||||||
|
return authHandle.AuthFrontendAPI(next, onFailureResponse)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
s.restAPIHandle.MountFrontendAPIRoutes(frontendapi)
|
||||||
|
})
|
||||||
|
|
||||||
|
if config.Keys.APISubjects != nil {
|
||||||
|
s.natsAPIHandle = api.NewNatsAPI()
|
||||||
|
if err := s.natsAPIHandle.StartSubscriptions(); err != nil {
|
||||||
|
return fmt.Errorf("starting NATS subscriptions: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 404 handler for pages and API routes
|
||||||
|
notFoundHandler := func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") ||
|
||||||
|
strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") {
|
||||||
|
rw.Header().Set("Content-Type", "application/json")
|
||||||
|
rw.WriteHeader(http.StatusNotFound)
|
||||||
|
json.NewEncoder(rw).Encode(map[string]string{
|
||||||
|
"status": "Resource not found",
|
||||||
|
"error": "the requested endpoint does not exist",
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
rw.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
rw.WriteHeader(http.StatusNotFound)
|
||||||
|
web.RenderTemplate(rw, "404.tmpl", &web.Page{
|
||||||
|
Title: "Page Not Found",
|
||||||
|
Build: buildInfo,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set NotFound on the router so chi uses it for all unmatched routes,
|
||||||
|
// including those under subrouters like /api, /userapi, /frontend, etc.
|
||||||
|
s.router.NotFound(notFoundHandler)
|
||||||
|
|
||||||
|
if config.Keys.EmbedStaticFiles {
|
||||||
|
if i, err := os.Stat("./var/img"); err == nil {
|
||||||
|
if i.IsDir() {
|
||||||
|
cclog.Info("Use local directory for static images")
|
||||||
|
s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fileServer := http.StripPrefix("/", web.ServeFiles())
|
||||||
|
s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
if web.StaticFileExists(r.URL.Path) {
|
||||||
|
fileServer.ServeHTTP(rw, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
notFoundHandler(rw, r)
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
staticDir := http.Dir(config.Keys.StaticFiles)
|
||||||
|
fileServer := http.FileServer(staticDir)
|
||||||
|
s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
f, err := staticDir.Open(r.URL.Path)
|
||||||
|
if err == nil {
|
||||||
|
f.Close()
|
||||||
|
fileServer.ServeHTTP(rw, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
notFoundHandler(rw, r)
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Server timeout defaults (in seconds)
|
||||||
|
const (
|
||||||
|
defaultReadTimeout = 20
|
||||||
|
defaultWriteTimeout = 20
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *Server) Start(ctx context.Context) error {
|
||||||
|
// Use configurable timeouts with defaults
|
||||||
|
readTimeout := time.Duration(defaultReadTimeout) * time.Second
|
||||||
|
writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
|
||||||
|
|
||||||
|
s.server = &http.Server{
|
||||||
|
ReadTimeout: readTimeout,
|
||||||
|
WriteTimeout: writeTimeout,
|
||||||
|
Handler: s.router,
|
||||||
|
Addr: config.Keys.Addr,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start http or https server
|
||||||
|
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("starting listener on '%s': %w", config.Keys.Addr, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
|
||||||
|
go func() {
|
||||||
|
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.Keys.HTTPSCertFile != "" && config.Keys.HTTPSKeyFile != "" {
|
||||||
|
cert, err := tls.LoadX509KeyPair(
|
||||||
|
config.Keys.HTTPSCertFile, config.Keys.HTTPSKeyFile)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("loading X509 keypair (check 'https-cert-file' and 'https-key-file' in config.json): %w", err)
|
||||||
|
}
|
||||||
|
listener = tls.NewListener(listener, &tls.Config{
|
||||||
|
Certificates: []tls.Certificate{cert},
|
||||||
|
CipherSuites: []uint16{
|
||||||
|
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||||
|
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||||
|
},
|
||||||
|
MinVersion: tls.VersionTLS12,
|
||||||
|
PreferServerCipherSuites: true,
|
||||||
|
})
|
||||||
|
cclog.Infof("HTTPS server listening at %s...", config.Keys.Addr)
|
||||||
|
} else {
|
||||||
|
cclog.Infof("HTTP server listening at %s...", config.Keys.Addr)
|
||||||
|
}
|
||||||
|
//
|
||||||
|
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||||
|
// be established first, then the user can be changed, and after that,
|
||||||
|
// the actual http server can be started.
|
||||||
|
if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||||
|
return fmt.Errorf("dropping privileges: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle context cancellation for graceful shutdown
|
||||||
|
go func() {
|
||||||
|
<-ctx.Done()
|
||||||
|
shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||||
|
cclog.Errorf("Server shutdown error: %v", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if err = s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||||
|
return fmt.Errorf("server failed: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Server) Shutdown(ctx context.Context) {
|
||||||
|
// Create a shutdown context with timeout
|
||||||
|
shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
nc := nats.GetClient()
|
||||||
|
if nc != nil {
|
||||||
|
nc.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||||
|
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||||
|
cclog.Errorf("Server shutdown error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Archive all the metric store data
|
||||||
|
ms := metricstore.GetMemoryStore()
|
||||||
|
|
||||||
|
if ms != nil {
|
||||||
|
metricstore.Shutdown()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shutdown archiver with 10 second timeout for fast shutdown
|
||||||
|
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||||
|
cclog.Warnf("Archiver shutdown: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,56 +1,26 @@
|
|||||||
{
|
{
|
||||||
"addr": "127.0.0.1:8080",
|
"main": {
|
||||||
"archive": {
|
"addr": "127.0.0.1:8080"
|
||||||
"kind": "file",
|
},
|
||||||
"path": "./var/job-archive"
|
"cron": {
|
||||||
},
|
"commit-job-worker": "1m",
|
||||||
|
"duration-worker": "3m",
|
||||||
|
"footprint-worker": "5m"
|
||||||
|
},
|
||||||
|
"auth": {
|
||||||
"jwts": {
|
"jwts": {
|
||||||
"max-age": "2000h"
|
"max-age": "2000h"
|
||||||
},
|
}
|
||||||
"clusters": [
|
},
|
||||||
{
|
"metric-store-external": [
|
||||||
"name": "fritz",
|
{
|
||||||
"metricDataRepository": {
|
"scope": "fritz",
|
||||||
"kind": "cc-metric-store",
|
"url": "http://0.0.0.0:8082",
|
||||||
"url": "http://localhost:8082",
|
"token": "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI6MTc2ODU3ODg0NCwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9._SDEW9WaUVXSBFmWqGhyIZXLoqoDU8F1hkfh4cXKIqF4yw7w50IUpfUBtwUFUOnoviFKoi563f6RAMC7XxeLDA"
|
||||||
"token": ""
|
}
|
||||||
},
|
],
|
||||||
"filterRanges": {
|
"metric-store": {
|
||||||
"numNodes": {
|
"retention-in-memory": "24h",
|
||||||
"from": 1,
|
"memory-cap": 100
|
||||||
"to": 64
|
}
|
||||||
},
|
|
||||||
"duration": {
|
|
||||||
"from": 0,
|
|
||||||
"to": 86400
|
|
||||||
},
|
|
||||||
"startTime": {
|
|
||||||
"from": "2022-01-01T00:00:00Z",
|
|
||||||
"to": null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "alex",
|
|
||||||
"metricDataRepository": {
|
|
||||||
"kind": "cc-metric-store",
|
|
||||||
"url": "http://localhost:8082",
|
|
||||||
"token": ""
|
|
||||||
},
|
|
||||||
"filterRanges": {
|
|
||||||
"numNodes": {
|
|
||||||
"from": 1,
|
|
||||||
"to": 64
|
|
||||||
},
|
|
||||||
"duration": {
|
|
||||||
"from": 0,
|
|
||||||
"to": 86400
|
|
||||||
},
|
|
||||||
"startTime": {
|
|
||||||
"from": "2022-01-01T00:00:00Z",
|
|
||||||
"to": null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,50 +1,97 @@
|
|||||||
{
|
{
|
||||||
|
"main": {
|
||||||
"addr": "0.0.0.0:443",
|
"addr": "0.0.0.0:443",
|
||||||
"ldap": {
|
|
||||||
"url": "ldaps://test",
|
|
||||||
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
|
|
||||||
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
|
|
||||||
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
|
|
||||||
"user_filter": "(&(objectclass=posixAccount))"
|
|
||||||
},
|
|
||||||
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
|
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
|
||||||
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
|
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
|
||||||
"user": "clustercockpit",
|
"user": "clustercockpit",
|
||||||
"group": "clustercockpit",
|
"group": "clustercockpit",
|
||||||
"archive": {
|
"api-allowed-ips": ["*"],
|
||||||
"kind": "file",
|
"short-running-jobs-duration": 300,
|
||||||
"path": "./var/job-archive"
|
"enable-job-taggers": true,
|
||||||
|
"nodestate-retention": {
|
||||||
|
"policy": "move",
|
||||||
|
"target-kind": "file",
|
||||||
|
"target-path": "./var/nodestate-archive"
|
||||||
},
|
},
|
||||||
"validate": true,
|
"resampling": {
|
||||||
"clusters": [
|
"minimum-points": 600,
|
||||||
{
|
"trigger": 180,
|
||||||
"name": "test",
|
"resolutions": [240, 60]
|
||||||
"metricDataRepository": {
|
},
|
||||||
"kind": "cc-metric-store",
|
"api-subjects": {
|
||||||
"url": "http://localhost:8082",
|
"subject-job-event": "cc.job.event",
|
||||||
"token": "eyJhbGciOiJF-E-pQBQ"
|
"subject-node-state": "cc.node.state"
|
||||||
},
|
}
|
||||||
"filterRanges": {
|
},
|
||||||
"numNodes": {
|
"nats": {
|
||||||
"from": 1,
|
"address": "nats://0.0.0.0:4222",
|
||||||
"to": 64
|
"username": "root",
|
||||||
},
|
"password": "root"
|
||||||
"duration": {
|
},
|
||||||
"from": 0,
|
"auth": {
|
||||||
"to": 86400
|
|
||||||
},
|
|
||||||
"startTime": {
|
|
||||||
"from": "2022-01-01T00:00:00Z",
|
|
||||||
"to": null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"jwts": {
|
"jwts": {
|
||||||
"cookieName": "",
|
"max-age": "2000h"
|
||||||
"validateUser": false,
|
}
|
||||||
"max-age": "2000h",
|
},
|
||||||
"trustedIssuer": ""
|
"cron": {
|
||||||
|
"commit-job-worker": "1m",
|
||||||
|
"duration-worker": "5m",
|
||||||
|
"footprint-worker": "10m"
|
||||||
|
},
|
||||||
|
"archive": {
|
||||||
|
"kind": "s3",
|
||||||
|
"endpoint": "http://x.x.x.x",
|
||||||
|
"bucket": "jobarchive",
|
||||||
|
"access-key": "xx",
|
||||||
|
"secret-key": "xx",
|
||||||
|
"retention": {
|
||||||
|
"policy": "move",
|
||||||
|
"age": 365,
|
||||||
|
"location": "./var/archive"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metric-store-external": [
|
||||||
|
{
|
||||||
|
"scope": "*",
|
||||||
|
"url": "http://x.x.x.x:8082",
|
||||||
|
"token": "MySecret"
|
||||||
},
|
},
|
||||||
"short-running-jobs-duration": 300
|
{
|
||||||
|
"scope": "fritz",
|
||||||
|
"url": "http://x.x.x.x:8084",
|
||||||
|
"token": "MySecret"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"scope": "fritz-spr1tb",
|
||||||
|
"url": "http://x.x.x.x:8083",
|
||||||
|
"token": "MySecret"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"scope": "alex",
|
||||||
|
"url": "http://x.x.x.x:8084",
|
||||||
|
"token": "MySecret"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metric-store": {
|
||||||
|
"checkpoints": {
|
||||||
|
"directory": "./var/checkpoints"
|
||||||
|
},
|
||||||
|
"memory-cap": 100,
|
||||||
|
"retention-in-memory": "24h",
|
||||||
|
"cleanup": {
|
||||||
|
"mode": "archive",
|
||||||
|
"directory": "./var/archive"
|
||||||
|
},
|
||||||
|
"nats-subscriptions": [
|
||||||
|
{
|
||||||
|
"subscribe-to": "hpc-nats",
|
||||||
|
"cluster-tag": "fritz"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"subscribe-to": "hpc-nats",
|
||||||
|
"cluster-tag": "alex"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"ui-file": "ui-config.json"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
|
|||||||
|
|
||||||
my $node;
|
my $node;
|
||||||
my @sockets;
|
my @sockets;
|
||||||
|
my @nodeCores;
|
||||||
foreach my $socket ( @{$DOMAINS{socket}} ) {
|
foreach my $socket ( @{$DOMAINS{socket}} ) {
|
||||||
push @sockets, "[".join(",", @{$socket})."]";
|
push @sockets, "[".join(",", @{$socket})."]";
|
||||||
$node .= join(",", @{$socket})
|
push @nodeCores, join(",", @{$socket});
|
||||||
}
|
}
|
||||||
|
$node = join(",", @nodeCores);
|
||||||
$INFO{sockets} = join(",\n", @sockets);
|
$INFO{sockets} = join(",\n", @sockets);
|
||||||
|
|
||||||
my @memDomains;
|
my @memDomains;
|
||||||
@@ -212,9 +214,27 @@ print <<"END";
|
|||||||
"socketsPerNode": $INFO{socketsPerNode},
|
"socketsPerNode": $INFO{socketsPerNode},
|
||||||
"coresPerSocket": $INFO{coresPerSocket},
|
"coresPerSocket": $INFO{coresPerSocket},
|
||||||
"threadsPerCore": $INFO{threadsPerCore},
|
"threadsPerCore": $INFO{threadsPerCore},
|
||||||
"flopRateScalar": $flopsScalar,
|
"flopRateScalar": {
|
||||||
"flopRateSimd": $flopsSimd,
|
"unit": {
|
||||||
"memoryBandwidth": $memBw,
|
"base": "F/s",
|
||||||
|
"prefix": "G"
|
||||||
|
},
|
||||||
|
"value": $flopsScalar
|
||||||
|
},
|
||||||
|
"flopRateSimd": {
|
||||||
|
"unit": {
|
||||||
|
"base": "F/s",
|
||||||
|
"prefix": "G"
|
||||||
|
},
|
||||||
|
"value": $flopsSimd
|
||||||
|
},
|
||||||
|
"memoryBandwidth": {
|
||||||
|
"unit": {
|
||||||
|
"base": "B/s",
|
||||||
|
"prefix": "G"
|
||||||
|
},
|
||||||
|
"value": $memBw
|
||||||
|
},
|
||||||
"nodes": "<FILL IN NODE RANGES>",
|
"nodes": "<FILL IN NODE RANGES>",
|
||||||
"topology": {
|
"topology": {
|
||||||
"node": [$node],
|
"node": [$node],
|
||||||
|
|||||||
22
configs/startJobPayload.json
Normal file
22
configs/startJobPayload.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"cluster": "fritz",
|
||||||
|
"jobId": 123000,
|
||||||
|
"jobState": "running",
|
||||||
|
"numAcc": 0,
|
||||||
|
"numHwthreads": 72,
|
||||||
|
"numNodes": 1,
|
||||||
|
"partition": "main",
|
||||||
|
"requestedMemory": 128000,
|
||||||
|
"resources": [{ "hostname": "f0726" }],
|
||||||
|
"startTime": 1649723812,
|
||||||
|
"subCluster": "main",
|
||||||
|
"submitTime": 1649723812,
|
||||||
|
"user": "k106eb10",
|
||||||
|
"project": "k106eb",
|
||||||
|
"walltime": 86400,
|
||||||
|
"metaData": {
|
||||||
|
"slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n",
|
||||||
|
"jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n",
|
||||||
|
"jobName": "ams_pipeline"
|
||||||
|
}
|
||||||
|
}
|
||||||
7
configs/stopJobPayload.json
Normal file
7
configs/stopJobPayload.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"cluster": "fritz",
|
||||||
|
"jobId": 123000,
|
||||||
|
"jobState": "completed",
|
||||||
|
"startTime": 1649723812,
|
||||||
|
"stopTime": 1649763839
|
||||||
|
}
|
||||||
419
configs/tagger/README.md
Normal file
419
configs/tagger/README.md
Normal file
@@ -0,0 +1,419 @@
|
|||||||
|
# Job Tagging Configuration
|
||||||
|
|
||||||
|
ClusterCockpit provides automatic job tagging functionality to classify and
|
||||||
|
categorize jobs based on configurable rules. The tagging system consists of two
|
||||||
|
main components:
|
||||||
|
|
||||||
|
1. **Application Detection** - Identifies which application a job is running
|
||||||
|
2. **Job Classification** - Analyzes job performance characteristics and applies classification tags
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
configs/tagger/
|
||||||
|
├── apps/ # Application detection patterns
|
||||||
|
│ ├── vasp.txt
|
||||||
|
│ ├── gromacs.txt
|
||||||
|
│ └── ...
|
||||||
|
└── jobclasses/ # Job classification rules
|
||||||
|
├── parameters.json
|
||||||
|
├── lowUtilization.json
|
||||||
|
├── highload.json
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Activating Tagger Rules
|
||||||
|
|
||||||
|
### Step 1: Copy Configuration Files
|
||||||
|
|
||||||
|
To activate tagging, review, adapt, and copy the configuration files from
|
||||||
|
`configs/tagger/` to `var/tagger/`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From the cc-backend root directory
|
||||||
|
mkdir -p var/tagger
|
||||||
|
cp -r configs/tagger/apps var/tagger/
|
||||||
|
cp -r configs/tagger/jobclasses var/tagger/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Enable Tagging in Configuration
|
||||||
|
|
||||||
|
Add or set the following configuration key in the `main` section of your `config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"enable-job-taggers": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important**: Automatic tagging is disabled by default. You must explicitly
|
||||||
|
enable it by setting `enable-job-taggers: true` in the main configuration file.
|
||||||
|
|
||||||
|
### Step 3: Restart cc-backend
|
||||||
|
|
||||||
|
The tagger system automatically loads configuration from `./var/tagger/` at
|
||||||
|
startup. After copying the files and enabling the feature, restart cc-backend:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./cc-backend -server
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Verify Configuration Loaded
|
||||||
|
|
||||||
|
Check the logs for messages indicating successful configuration loading:
|
||||||
|
|
||||||
|
```
|
||||||
|
[INFO] Setup file watch for ./var/tagger/apps
|
||||||
|
[INFO] Setup file watch for ./var/tagger/jobclasses
|
||||||
|
```
|
||||||
|
|
||||||
|
## How Tagging Works
|
||||||
|
|
||||||
|
### Automatic Tagging
|
||||||
|
|
||||||
|
When `enable-job-taggers` is set to `true` in the configuration, tags are
|
||||||
|
automatically applied when:
|
||||||
|
|
||||||
|
- **Job Start**: Application detection runs immediately when a job starts
|
||||||
|
- **Job Stop**: Job classification runs when a job completes
|
||||||
|
|
||||||
|
The system analyzes job metadata and metrics to determine appropriate tags.
|
||||||
|
|
||||||
|
**Note**: Automatic tagging only works for jobs that start or stop after the
|
||||||
|
feature is enabled. Existing jobs are not automatically retagged.
|
||||||
|
|
||||||
|
### Manual Tagging (Retroactive)
|
||||||
|
|
||||||
|
To apply tags to existing jobs in the database, use the `-apply-tags` command
|
||||||
|
line option:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./cc-backend -apply-tags
|
||||||
|
```
|
||||||
|
|
||||||
|
This processes all jobs in the database and applies current tagging rules. This
|
||||||
|
is useful when:
|
||||||
|
|
||||||
|
- You have existing jobs that were created before tagging was enabled
|
||||||
|
- You've added new tagging rules and want to apply them to historical data
|
||||||
|
- You've modified existing rules and want to re-evaluate all jobs
|
||||||
|
|
||||||
|
### Hot Reload
|
||||||
|
|
||||||
|
The tagger system watches the configuration directories for changes. You can
|
||||||
|
modify or add rules without restarting `cc-backend`:
|
||||||
|
|
||||||
|
- Changes to `var/tagger/apps/*` are detected automatically
|
||||||
|
- Changes to `var/tagger/jobclasses/*` are detected automatically
|
||||||
|
|
||||||
|
## Application Detection
|
||||||
|
|
||||||
|
Application detection identifies which software a job is running by matching
|
||||||
|
patterns in the job script.
|
||||||
|
|
||||||
|
### Configuration Format
|
||||||
|
|
||||||
|
Application patterns are stored in text files under `var/tagger/apps/`. Each
|
||||||
|
file contains one or more regular expression patterns (one per line) that match
|
||||||
|
against the job script.
|
||||||
|
|
||||||
|
**Example: `apps/vasp.txt`**
|
||||||
|
|
||||||
|
```
|
||||||
|
vasp
|
||||||
|
VASP
|
||||||
|
```
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
1. When a job starts, the system retrieves the job script from metadata
|
||||||
|
2. Each line in the app files is treated as a regex pattern
|
||||||
|
3. Patterns are matched case-insensitively against the lowercased job script
|
||||||
|
4. If a match is found, a tag of type `app` with the filename (without extension) is applied
|
||||||
|
5. Only the first matching application is tagged
|
||||||
|
|
||||||
|
### Adding New Applications
|
||||||
|
|
||||||
|
1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`)
|
||||||
|
2. Add regex patterns, one per line:
|
||||||
|
|
||||||
|
```
|
||||||
|
tensorflow
|
||||||
|
tf\.keras
|
||||||
|
import tensorflow
|
||||||
|
```
|
||||||
|
|
||||||
|
3. The file is automatically detected and loaded
|
||||||
|
|
||||||
|
**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`).
|
||||||
|
|
||||||
|
## Job Classification
|
||||||
|
|
||||||
|
Job classification analyzes completed jobs based on their metrics and properties
|
||||||
|
to identify performance issues or characteristics.
|
||||||
|
|
||||||
|
### Configuration Format
|
||||||
|
|
||||||
|
Job classification rules are defined in JSON files under
|
||||||
|
`var/tagger/jobclasses/`. Each rule file defines:
|
||||||
|
|
||||||
|
- **Metrics required**: Which job metrics to analyze
|
||||||
|
- **Requirements**: Pre-conditions that must be met
|
||||||
|
- **Variables**: Computed values used in the rule
|
||||||
|
- **Rule expression**: Boolean expression that determines if the rule matches
|
||||||
|
- **Hint template**: Message displayed when the rule matches
|
||||||
|
|
||||||
|
### Parameters File
|
||||||
|
|
||||||
|
`jobclasses/parameters.json` defines shared threshold values used across multiple rules:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"lowcpuload_threshold_factor": 0.9,
|
||||||
|
"highmemoryusage_threshold_factor": 0.9,
|
||||||
|
"job_min_duration_seconds": 600.0,
|
||||||
|
"sampling_interval_seconds": 30.0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rule File Structure
|
||||||
|
|
||||||
|
**Example: `jobclasses/lowUtilization.json`**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "Low resource utilization",
|
||||||
|
"tag": "lowutilization",
|
||||||
|
"parameters": ["job_min_duration_seconds"],
|
||||||
|
"metrics": ["flops_any", "mem_bw"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "mem_bw_perc",
|
||||||
|
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "flops_any.avg < flops_any.limits.alert",
|
||||||
|
"hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Field Descriptions
|
||||||
|
|
||||||
|
| Field | Description |
|
||||||
|
| -------------- | ----------------------------------------------------------------------------- |
|
||||||
|
| `name` | Human-readable description of the rule |
|
||||||
|
| `tag` | Tag identifier applied when the rule matches |
|
||||||
|
| `parameters` | List of parameter names from `parameters.json` to include in rule environment |
|
||||||
|
| `metrics` | List of metrics required for evaluation (must be present in job data) |
|
||||||
|
| `requirements` | Boolean expressions that must all be true for the rule to be evaluated |
|
||||||
|
| `variables` | Named expressions computed before evaluating the main rule |
|
||||||
|
| `rule` | Boolean expression that determines if the job matches this classification |
|
||||||
|
| `hint` | Go template string for generating a user-visible message |
|
||||||
|
|
||||||
|
### Expression Environment
|
||||||
|
|
||||||
|
Expressions in `requirements`, `variables`, and `rule` have access to:
|
||||||
|
|
||||||
|
**Job Properties:**
|
||||||
|
|
||||||
|
- `job.shared` - Shared node allocation type
|
||||||
|
- `job.duration` - Job runtime in seconds
|
||||||
|
- `job.numCores` - Number of CPU cores
|
||||||
|
- `job.numNodes` - Number of nodes
|
||||||
|
- `job.jobState` - Job completion state
|
||||||
|
- `job.numAcc` - Number of accelerators
|
||||||
|
- `job.smt` - SMT setting
|
||||||
|
|
||||||
|
**Metric Statistics (for each metric in `metrics`):**
|
||||||
|
|
||||||
|
- `<metric>.min` - Minimum value
|
||||||
|
- `<metric>.max` - Maximum value
|
||||||
|
- `<metric>.avg` - Average value
|
||||||
|
- `<metric>.limits.peak` - Peak limit from cluster config
|
||||||
|
- `<metric>.limits.normal` - Normal threshold
|
||||||
|
- `<metric>.limits.caution` - Caution threshold
|
||||||
|
- `<metric>.limits.alert` - Alert threshold
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
|
||||||
|
- All parameters listed in the `parameters` field
|
||||||
|
|
||||||
|
**Variables:**
|
||||||
|
|
||||||
|
- All variables defined in the `variables` array
|
||||||
|
|
||||||
|
### Expression Language
|
||||||
|
|
||||||
|
Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations:
|
||||||
|
|
||||||
|
- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^`
|
||||||
|
- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=`
|
||||||
|
- **Logical**: `&&`, `||`, `!`
|
||||||
|
- **Functions**: Standard math functions (see expr documentation)
|
||||||
|
|
||||||
|
### Hint Templates
|
||||||
|
|
||||||
|
Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible:
|
||||||
|
|
||||||
|
```
|
||||||
|
{{.flops_any.avg}} # Access metric average
|
||||||
|
{{.job.duration}} # Access job property
|
||||||
|
{{.my_variable}} # Access computed variable
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adding New Classification Rules
|
||||||
|
|
||||||
|
1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`)
|
||||||
|
2. Define the rule structure:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "Memory Leak Detection",
|
||||||
|
"tag": "memory_leak",
|
||||||
|
"parameters": ["memory_leak_slope_threshold"],
|
||||||
|
"metrics": ["mem_used"],
|
||||||
|
"requirements": ["job.duration > 3600"],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "mem_growth",
|
||||||
|
"expr": "(mem_used.max - mem_used.min) / job.duration"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "mem_growth > memory_leak_slope_threshold",
|
||||||
|
"hint": "Memory usage grew by {{.mem_growth}} per second"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Add any new parameters to `parameters.json`
|
||||||
|
4. The file is automatically detected and loaded
|
||||||
|
|
||||||
|
## Configuration Paths
|
||||||
|
|
||||||
|
The tagger system reads from these paths (relative to cc-backend working directory):
|
||||||
|
|
||||||
|
- **Application patterns**: `./var/tagger/apps/`
|
||||||
|
- **Job classification rules**: `./var/tagger/jobclasses/`
|
||||||
|
|
||||||
|
These paths are defined as constants in the source code and cannot be changed without recompiling.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Tags Not Applied
|
||||||
|
|
||||||
|
1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json`
|
||||||
|
|
||||||
|
2. **Check configuration exists**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ls -la var/tagger/apps
|
||||||
|
ls -la var/tagger/jobclasses
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Check logs for errors**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./cc-backend -server -loglevel debug
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Verify file permissions**: Ensure cc-backend can read the configuration files
|
||||||
|
|
||||||
|
5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs
|
||||||
|
|
||||||
|
### Rules Not Matching
|
||||||
|
|
||||||
|
1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation
|
||||||
|
2. **Check requirements**: Ensure all requirements in the rule are satisfied
|
||||||
|
3. **Verify metrics exist**: Classification rules require job metrics to be available
|
||||||
|
4. **Check metric names**: Ensure metric names match those in your cluster configuration
|
||||||
|
|
||||||
|
### File Watch Not Working
|
||||||
|
|
||||||
|
If changes to configuration files aren't detected:
|
||||||
|
|
||||||
|
1. Restart cc-backend to reload all configuration
|
||||||
|
2. Check filesystem supports file watching (network filesystems may not)
|
||||||
|
3. Check logs for file watch setup messages
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Start Simple**: Begin with basic rules and refine based on results
|
||||||
|
2. **Use Requirements**: Filter out irrelevant jobs early with requirements
|
||||||
|
3. **Test Incrementally**: Add one rule at a time and verify behavior
|
||||||
|
4. **Document Rules**: Use descriptive names and clear hint messages
|
||||||
|
5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency
|
||||||
|
6. **Version Control**: Keep your `var/tagger/` configuration in version control
|
||||||
|
7. **Backup Before Changes**: Test new rules on a copy before deploying to production
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Simple Application Detection
|
||||||
|
|
||||||
|
**File: `var/tagger/apps/python.txt`**
|
||||||
|
|
||||||
|
```
|
||||||
|
python
|
||||||
|
python3
|
||||||
|
\.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This detects jobs running Python scripts.
|
||||||
|
|
||||||
|
### Complex Classification Rule
|
||||||
|
|
||||||
|
**File: `var/tagger/jobclasses/cpuImbalance.json`**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "CPU Load Imbalance",
|
||||||
|
"tag": "cpu_imbalance",
|
||||||
|
"parameters": ["core_load_imbalance_threshold_factor"],
|
||||||
|
"metrics": ["cpu_load"],
|
||||||
|
"requirements": ["job.numCores > 1", "job.duration > 600"],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "load_variance",
|
||||||
|
"expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "load_variance > core_load_imbalance_threshold_factor",
|
||||||
|
"hint": "CPU load varies by {{printf \"%.1f%%\" (load_variance * 100)}} across cores"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This detects jobs where CPU load is unevenly distributed across cores.
|
||||||
|
|
||||||
|
## Reference
|
||||||
|
|
||||||
|
### Configuration Options
|
||||||
|
|
||||||
|
**Main Configuration (`config.json`)**:
|
||||||
|
|
||||||
|
- `enable-job-taggers` (boolean, default: `false`) - Enables automatic job tagging system
|
||||||
|
- Must be set to `true` to activate automatic tagging on job start/stop events
|
||||||
|
- Does not affect the `-apply-tags` command line option
|
||||||
|
|
||||||
|
**Command Line Options**:
|
||||||
|
|
||||||
|
- `-apply-tags` - Apply all tagging rules to existing jobs in the database
|
||||||
|
- Works independently of `enable-job-taggers` configuration
|
||||||
|
- Useful for retroactively tagging jobs or re-evaluating with updated rules
|
||||||
|
|
||||||
|
### Default Configuration Location
|
||||||
|
|
||||||
|
The example configurations are provided in:
|
||||||
|
|
||||||
|
- `configs/tagger/apps/` - Example application patterns (16 applications)
|
||||||
|
- `configs/tagger/jobclasses/` - Example classification rules (3 rules)
|
||||||
|
|
||||||
|
Copy these to `var/tagger/` and customize for your environment.
|
||||||
|
|
||||||
|
### Tag Types
|
||||||
|
|
||||||
|
- `app` - Application tags (e.g., "vasp", "gromacs")
|
||||||
|
- `jobClass` - Classification tags (e.g., "lowutilization", "highload")
|
||||||
|
|
||||||
|
Tags can be queried and filtered in the ClusterCockpit UI and API.
|
||||||
1
configs/tagger/apps/alf.txt
Normal file
1
configs/tagger/apps/alf.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
alf
|
||||||
6
configs/tagger/apps/caracal.txt
Normal file
6
configs/tagger/apps/caracal.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
calc_rate
|
||||||
|
qmdffgen
|
||||||
|
dynamic
|
||||||
|
evbopt
|
||||||
|
black_box
|
||||||
|
poly_qmdff
|
||||||
3
configs/tagger/apps/chroma.txt
Normal file
3
configs/tagger/apps/chroma.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
chroma
|
||||||
|
qdp
|
||||||
|
qmp
|
||||||
1
configs/tagger/apps/cp2k.txt
Normal file
1
configs/tagger/apps/cp2k.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
cp2k
|
||||||
1
configs/tagger/apps/cpmd.txt
Normal file
1
configs/tagger/apps/cpmd.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
cpmd
|
||||||
1
configs/tagger/apps/flame.txt
Normal file
1
configs/tagger/apps/flame.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
flame
|
||||||
3
configs/tagger/apps/gromacs.txt
Normal file
3
configs/tagger/apps/gromacs.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
gromacs
|
||||||
|
gmx
|
||||||
|
mdrun
|
||||||
1
configs/tagger/apps/julia.txt
Normal file
1
configs/tagger/apps/julia.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
julia
|
||||||
1
configs/tagger/apps/lammps.txt
Normal file
1
configs/tagger/apps/lammps.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
\blmp\s+
|
||||||
1
configs/tagger/apps/matlab.txt
Normal file
1
configs/tagger/apps/matlab.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
matlab
|
||||||
1
configs/tagger/apps/openfoam.txt
Normal file
1
configs/tagger/apps/openfoam.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
openfoam
|
||||||
1
configs/tagger/apps/orca.txt
Normal file
1
configs/tagger/apps/orca.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
orca
|
||||||
4
configs/tagger/apps/python.txt
Normal file
4
configs/tagger/apps/python.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
python
|
||||||
|
pip
|
||||||
|
anaconda
|
||||||
|
conda
|
||||||
2
configs/tagger/apps/starccm.txt
Normal file
2
configs/tagger/apps/starccm.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
starccm+
|
||||||
|
-podkey
|
||||||
10
configs/tagger/apps/turbomole.txt
Normal file
10
configs/tagger/apps/turbomole.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
dscf
|
||||||
|
grad
|
||||||
|
ridft
|
||||||
|
rdgrad
|
||||||
|
ricc2
|
||||||
|
statpt
|
||||||
|
aoforce
|
||||||
|
escf
|
||||||
|
egrad
|
||||||
|
odft
|
||||||
3
configs/tagger/apps/vasp.txt
Normal file
3
configs/tagger/apps/vasp.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
vasp_gam
|
||||||
|
vasp_ncl
|
||||||
|
vasp_std
|
||||||
21
configs/tagger/jobclasses/highMemoryUsage.json
Normal file
21
configs/tagger/jobclasses/highMemoryUsage.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "High memory usage",
|
||||||
|
"tag": "highmemory",
|
||||||
|
"parameters": [
|
||||||
|
"highmemoryusage_threshold_factor",
|
||||||
|
"job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"metrics": ["mem_used"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "memory_usage_pct",
|
||||||
|
"expr": "mem_used.max / mem_used.limits.peak * 100.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "mem_used.max > mem_used.limits.alert",
|
||||||
|
"hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
|
||||||
|
}
|
||||||
21
configs/tagger/jobclasses/highload.json
Normal file
21
configs/tagger/jobclasses/highload.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "Excessive CPU load",
|
||||||
|
"tag": "excessiveload",
|
||||||
|
"parameters": [
|
||||||
|
"excessivecpuload_threshold_factor",
|
||||||
|
"job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"metrics": ["cpu_load"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "load_threshold",
|
||||||
|
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "cpu_load.avg > load_threshold",
|
||||||
|
"hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention."
|
||||||
|
}
|
||||||
22
configs/tagger/jobclasses/lowUtilization.json
Normal file
22
configs/tagger/jobclasses/lowUtilization.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "Low resource utilization",
|
||||||
|
"tag": "lowutilization",
|
||||||
|
"parameters": ["job_min_duration_seconds"],
|
||||||
|
"metrics": ["flops_any", "mem_bw"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "mem_bw_pct",
|
||||||
|
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "flops_any_pct",
|
||||||
|
"expr": "flops_any.avg / flops_any.limits.peak * 100.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
|
||||||
|
"hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds."
|
||||||
|
}
|
||||||
18
configs/tagger/jobclasses/lowload.json
Normal file
18
configs/tagger/jobclasses/lowload.json
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"name": "Low CPU load",
|
||||||
|
"tag": "lowload",
|
||||||
|
"parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"],
|
||||||
|
"metrics": ["cpu_load"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "load_threshold",
|
||||||
|
"expr": "cpu_load.limits.peak * lowcpuload_threshold_factor"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "cpu_load.avg < load_threshold",
|
||||||
|
"hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})."
|
||||||
|
}
|
||||||
22
configs/tagger/jobclasses/memoryBound.json
Normal file
22
configs/tagger/jobclasses/memoryBound.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "Memory bandwidth bound",
|
||||||
|
"tag": "memorybound",
|
||||||
|
"parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"],
|
||||||
|
"metrics": ["mem_bw"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "mem_bw_threshold",
|
||||||
|
"expr": "mem_bw.limits.peak * membound_bw_threshold_factor"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mem_bw_pct",
|
||||||
|
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "mem_bw.avg > mem_bw_threshold",
|
||||||
|
"hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) is within {{.membound_bw_threshold_factor}} of peak bandwidth. Consider improving data reuse or compute intensity."
|
||||||
|
}
|
||||||
15
configs/tagger/jobclasses/parameters.json
Normal file
15
configs/tagger/jobclasses/parameters.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"lowcpuload_threshold_factor": 0.85,
|
||||||
|
"excessivecpuload_threshold_factor": 1.2,
|
||||||
|
"highmemoryusage_threshold_factor": 0.9,
|
||||||
|
"node_load_imbalance_threshold_factor": 0.1,
|
||||||
|
"core_load_imbalance_threshold_factor": 0.1,
|
||||||
|
"high_memory_load_threshold_factor": 0.9,
|
||||||
|
"lowgpuload_threshold_factor": 0.7,
|
||||||
|
"membound_bw_threshold_factor": 0.8,
|
||||||
|
"memory_leak_slope_threshold": 0.1,
|
||||||
|
"job_min_duration_seconds": 600.0,
|
||||||
|
"sampling_interval_seconds": 30.0,
|
||||||
|
"cpu_load_pre_cutoff_samples": 11.0,
|
||||||
|
"cpu_load_core_pre_cutoff_samples": 6.0
|
||||||
|
}
|
||||||
45
configs/uiConfig.json
Normal file
45
configs/uiConfig.json
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
{
|
||||||
|
"job-list": {
|
||||||
|
"use-paging": false,
|
||||||
|
"show-footprint":false
|
||||||
|
},
|
||||||
|
"job-view": {
|
||||||
|
"show-polar-plot": true,
|
||||||
|
"show-footprint": true,
|
||||||
|
"show-roofline": true,
|
||||||
|
"show-stat-table": true
|
||||||
|
},
|
||||||
|
"metric-config": {
|
||||||
|
"job-list-metrics": ["mem_bw", "flops_dp"],
|
||||||
|
"job-view-plot-metrics": ["mem_bw", "flops_dp"],
|
||||||
|
"job-view-table-metrics": ["mem_bw", "flops_dp"],
|
||||||
|
"clusters": [
|
||||||
|
{
|
||||||
|
"name": "test",
|
||||||
|
"sub-clusters": [
|
||||||
|
{
|
||||||
|
"name": "one",
|
||||||
|
"job-list-metrics": ["mem_used", "flops_sp"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"node-list": {
|
||||||
|
"use-paging": true
|
||||||
|
},
|
||||||
|
"plot-configuration": {
|
||||||
|
"plots-per-row": 3,
|
||||||
|
"color-background": true,
|
||||||
|
"line-width": 3,
|
||||||
|
"color-scheme": [
|
||||||
|
"#00bfff",
|
||||||
|
"#0000ff",
|
||||||
|
"#ff00ff",
|
||||||
|
"#ff0000",
|
||||||
|
"#ff8000",
|
||||||
|
"#ffff00",
|
||||||
|
"#80ff00"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
181
go.mod
181
go.mod
@@ -1,92 +1,123 @@
|
|||||||
module github.com/ClusterCockpit/cc-backend
|
module github.com/ClusterCockpit/cc-backend
|
||||||
|
|
||||||
go 1.18
|
go 1.25.0
|
||||||
|
|
||||||
require (
|
tool (
|
||||||
github.com/99designs/gqlgen v0.17.45
|
github.com/99designs/gqlgen
|
||||||
github.com/ClusterCockpit/cc-units v0.4.0
|
github.com/swaggo/swag/cmd/swag
|
||||||
github.com/Masterminds/squirrel v1.5.3
|
|
||||||
github.com/coreos/go-oidc/v3 v3.9.0
|
|
||||||
github.com/go-co-op/gocron v1.25.0
|
|
||||||
github.com/go-ldap/ldap/v3 v3.4.4
|
|
||||||
github.com/go-sql-driver/mysql v1.7.0
|
|
||||||
github.com/golang-jwt/jwt/v5 v5.2.1
|
|
||||||
github.com/golang-migrate/migrate/v4 v4.15.2
|
|
||||||
github.com/google/gops v0.3.27
|
|
||||||
github.com/gorilla/handlers v1.5.1
|
|
||||||
github.com/gorilla/mux v1.8.0
|
|
||||||
github.com/gorilla/sessions v1.2.1
|
|
||||||
github.com/influxdata/influxdb-client-go/v2 v2.12.2
|
|
||||||
github.com/jmoiron/sqlx v1.3.5
|
|
||||||
github.com/mattn/go-sqlite3 v1.14.16
|
|
||||||
github.com/prometheus/client_golang v1.14.0
|
|
||||||
github.com/prometheus/common v0.40.0
|
|
||||||
github.com/qustavo/sqlhooks/v2 v2.1.0
|
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
|
|
||||||
github.com/swaggo/http-swagger v1.3.3
|
|
||||||
github.com/swaggo/swag v1.16.3
|
|
||||||
github.com/vektah/gqlparser/v2 v2.5.11
|
|
||||||
golang.org/x/crypto v0.21.0
|
|
||||||
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
|
|
||||||
golang.org/x/oauth2 v0.13.0
|
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
github.com/99designs/gqlgen v0.17.87
|
||||||
|
github.com/ClusterCockpit/cc-lib/v2 v2.8.0
|
||||||
|
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0
|
||||||
|
github.com/Masterminds/squirrel v1.5.4
|
||||||
|
github.com/aws/aws-sdk-go-v2 v1.41.2
|
||||||
|
github.com/aws/aws-sdk-go-v2/config v1.32.10
|
||||||
|
github.com/aws/aws-sdk-go-v2/credentials v1.19.10
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.2
|
||||||
|
github.com/coreos/go-oidc/v3 v3.17.0
|
||||||
|
github.com/expr-lang/expr v1.17.8
|
||||||
|
github.com/go-chi/chi/v5 v5.2.5
|
||||||
|
github.com/go-chi/cors v1.2.2
|
||||||
|
github.com/go-co-op/gocron/v2 v2.19.1
|
||||||
|
github.com/go-ldap/ldap/v3 v3.4.12
|
||||||
|
github.com/golang-jwt/jwt/v5 v5.3.1
|
||||||
|
github.com/golang-migrate/migrate/v4 v4.19.1
|
||||||
|
github.com/google/gops v0.3.29
|
||||||
|
github.com/gorilla/sessions v1.4.0
|
||||||
|
github.com/jmoiron/sqlx v1.4.0
|
||||||
|
github.com/joho/godotenv v1.5.1
|
||||||
|
github.com/mattn/go-sqlite3 v1.14.34
|
||||||
|
github.com/parquet-go/parquet-go v0.28.0
|
||||||
|
github.com/qustavo/sqlhooks/v2 v2.1.0
|
||||||
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
|
||||||
|
github.com/stretchr/testify v1.11.1
|
||||||
|
github.com/swaggo/http-swagger v1.3.4
|
||||||
|
github.com/swaggo/swag v1.16.6
|
||||||
|
github.com/vektah/gqlparser/v2 v2.5.32
|
||||||
|
golang.org/x/crypto v0.48.0
|
||||||
|
golang.org/x/oauth2 v0.35.0
|
||||||
|
golang.org/x/time v0.14.0
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/Azure/go-ntlmssp v0.1.0 // indirect
|
||||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
github.com/agnivade/levenshtein v1.2.1 // indirect
|
||||||
|
github.com/andybalholm/brotli v1.2.0 // indirect
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||||
github.com/beorn7/perks v1.0.1 // indirect
|
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.5 // indirect
|
||||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.18 // indirect
|
||||||
github.com/containerd/containerd v1.6.26 // indirect
|
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.18 // indirect
|
||||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
|
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.18 // indirect
|
||||||
github.com/deepmap/oapi-codegen v1.12.4 // indirect
|
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
|
||||||
github.com/felixge/httpsnoop v1.0.3 // indirect
|
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.18 // indirect
|
||||||
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
|
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.5 // indirect
|
||||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.10 // indirect
|
||||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.18 // indirect
|
||||||
github.com/go-openapi/jsonreference v0.21.0 // indirect
|
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.18 // indirect
|
||||||
github.com/go-openapi/spec v0.21.0 // indirect
|
github.com/aws/aws-sdk-go-v2/service/signin v1.0.6 // indirect
|
||||||
github.com/go-openapi/swag v0.23.0 // indirect
|
github.com/aws/aws-sdk-go-v2/service/sso v1.30.11 // indirect
|
||||||
github.com/golang/protobuf v1.5.3 // indirect
|
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.15 // indirect
|
||||||
|
github.com/aws/aws-sdk-go-v2/service/sts v1.41.7 // indirect
|
||||||
|
github.com/aws/smithy-go v1.24.2 // indirect
|
||||||
|
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
|
||||||
|
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||||
|
github.com/fsnotify/fsnotify v1.9.0 // indirect
|
||||||
|
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect
|
||||||
|
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
|
||||||
|
github.com/go-openapi/jsonpointer v0.22.5 // indirect
|
||||||
|
github.com/go-openapi/jsonreference v0.21.5 // indirect
|
||||||
|
github.com/go-openapi/spec v0.22.4 // indirect
|
||||||
|
github.com/go-openapi/swag/conv v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/jsonname v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/jsonutils v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/loading v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/stringutils v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/typeutils v0.25.5 // indirect
|
||||||
|
github.com/go-openapi/swag/yamlutils v0.25.5 // indirect
|
||||||
|
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
|
||||||
|
github.com/goccy/go-yaml v1.19.2 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/gorilla/securecookie v1.1.1 // indirect
|
github.com/gorilla/securecookie v1.1.2 // indirect
|
||||||
github.com/gorilla/websocket v1.5.0 // indirect
|
github.com/gorilla/websocket v1.5.3 // indirect
|
||||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
|
||||||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
|
||||||
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
||||||
|
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
|
||||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||||
github.com/josharian/intern v1.0.0 // indirect
|
github.com/jonboulle/clockwork v0.5.0 // indirect
|
||||||
github.com/jpillora/backoff v1.0.0 // indirect
|
github.com/klauspost/compress v1.18.4 // indirect
|
||||||
github.com/json-iterator/go v1.1.12 // indirect
|
github.com/kr/pretty v0.3.1 // indirect
|
||||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
||||||
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
||||||
github.com/mailru/easyjson v0.7.7 // indirect
|
github.com/nats-io/nats.go v1.49.0 // indirect
|
||||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
github.com/nats-io/nkeys v0.4.15 // indirect
|
||||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
github.com/nats-io/nuid v1.0.1 // indirect
|
||||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
github.com/oapi-codegen/runtime v1.2.0 // indirect
|
||||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
github.com/parquet-go/bitpack v1.0.0 // indirect
|
||||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
|
github.com/parquet-go/jsonlite v1.4.0 // indirect
|
||||||
github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect
|
github.com/pierrec/lz4/v4 v4.1.25 // indirect
|
||||||
github.com/pkg/errors v0.9.1 // indirect
|
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||||
github.com/prometheus/client_model v0.3.0 // indirect
|
|
||||||
github.com/prometheus/procfs v0.9.0 // indirect
|
|
||||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||||
|
github.com/rogpeppe/go-internal v1.10.0 // indirect
|
||||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||||
github.com/sosodev/duration v1.2.0 // indirect
|
github.com/sosodev/duration v1.4.0 // indirect
|
||||||
github.com/swaggo/files v1.0.0 // indirect
|
github.com/stmcginnis/gofish v0.21.4 // indirect
|
||||||
github.com/urfave/cli/v2 v2.27.1 // indirect
|
github.com/stretchr/objx v0.5.2 // indirect
|
||||||
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
|
github.com/swaggo/files v1.0.1 // indirect
|
||||||
go.uber.org/atomic v1.10.0 // indirect
|
github.com/twpayne/go-geom v1.6.1 // indirect
|
||||||
golang.org/x/mod v0.16.0 // indirect
|
github.com/urfave/cli/v2 v2.27.7 // indirect
|
||||||
golang.org/x/net v0.22.0 // indirect
|
github.com/urfave/cli/v3 v3.6.2 // indirect
|
||||||
golang.org/x/sys v0.18.0 // indirect
|
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
|
||||||
golang.org/x/text v0.14.0 // indirect
|
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||||
golang.org/x/tools v0.19.0 // indirect
|
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||||
google.golang.org/appengine v1.6.8 // indirect
|
golang.org/x/mod v0.33.0 // indirect
|
||||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
|
golang.org/x/net v0.51.0 // indirect
|
||||||
google.golang.org/protobuf v1.33.0 // indirect
|
golang.org/x/sync v0.19.0 // indirect
|
||||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
golang.org/x/sys v0.41.0 // indirect
|
||||||
|
golang.org/x/text v0.34.0 // indirect
|
||||||
|
golang.org/x/tools v0.42.0 // indirect
|
||||||
|
google.golang.org/protobuf v1.36.11 // indirect
|
||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
sigs.k8s.io/yaml v1.6.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
63
gqlgen.yml
63
gqlgen.yml
@@ -30,7 +30,9 @@ resolver:
|
|||||||
# gqlgen will search for any type names in the schema in these go packages
|
# gqlgen will search for any type names in the schema in these go packages
|
||||||
# if they match it will use them, otherwise it will generate them.
|
# if they match it will use them, otherwise it will generate them.
|
||||||
autobind:
|
autobind:
|
||||||
|
- "github.com/99designs/gqlgen/graphql/introspection"
|
||||||
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||||
|
- "github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
|
||||||
# This section declares type mapping between the GraphQL and go type systems
|
# This section declares type mapping between the GraphQL and go type systems
|
||||||
#
|
#
|
||||||
@@ -50,34 +52,51 @@ models:
|
|||||||
- github.com/99designs/gqlgen/graphql.Int64
|
- github.com/99designs/gqlgen/graphql.Int64
|
||||||
- github.com/99designs/gqlgen/graphql.Int32
|
- github.com/99designs/gqlgen/graphql.Int32
|
||||||
Job:
|
Job:
|
||||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Job"
|
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job"
|
||||||
fields:
|
fields:
|
||||||
tags:
|
tags:
|
||||||
resolver: true
|
resolver: true
|
||||||
metaData:
|
metaData:
|
||||||
resolver: true
|
resolver: true
|
||||||
Cluster:
|
Cluster:
|
||||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Cluster"
|
model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster"
|
||||||
fields:
|
fields:
|
||||||
partitions:
|
partitions:
|
||||||
resolver: true
|
resolver: true
|
||||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
|
# Node:
|
||||||
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
|
# model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node"
|
||||||
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
|
# fields:
|
||||||
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
|
# metaData:
|
||||||
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
|
# resolver: true
|
||||||
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
|
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" }
|
||||||
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
|
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" }
|
||||||
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
|
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" }
|
||||||
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
|
JobStatistics:
|
||||||
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" }
|
||||||
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
|
GlobalMetricListItem:
|
||||||
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" }
|
||||||
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
|
ClusterSupport:
|
||||||
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" }
|
||||||
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
|
Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" }
|
||||||
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
|
Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" }
|
||||||
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
|
JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" }
|
||||||
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
|
Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" }
|
||||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
|
SchedulerState:
|
||||||
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" }
|
||||||
|
HealthState:
|
||||||
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" }
|
||||||
|
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" }
|
||||||
|
Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" }
|
||||||
|
MetricStatistics:
|
||||||
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" }
|
||||||
|
MetricConfig:
|
||||||
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" }
|
||||||
|
SubClusterConfig:
|
||||||
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" }
|
||||||
|
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" }
|
||||||
|
Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" }
|
||||||
|
FilterRanges:
|
||||||
|
{ model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" }
|
||||||
|
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" }
|
||||||
|
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" }
|
||||||
|
Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" }
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=ClusterCockpit Web Server (Go edition)
|
Description=ClusterCockpit Web Server
|
||||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
After=mariadb.service mysql.service
|
# Database is file-based SQLite - no service dependency required
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
WorkingDirectory=/opt/monitoring/cc-backend
|
WorkingDirectory=/opt/monitoring/cc-backend
|
||||||
@@ -12,7 +12,7 @@ NotifyAccess=all
|
|||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=30
|
RestartSec=30
|
||||||
TimeoutStopSec=100
|
TimeoutStopSec=100
|
||||||
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json
|
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json --server
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
package api_test
|
package api_test
|
||||||
@@ -14,47 +14,56 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"reflect"
|
"reflect"
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||||
"github.com/gorilla/mux"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
|
||||||
_ "github.com/mattn/go-sqlite3"
|
_ "github.com/mattn/go-sqlite3"
|
||||||
)
|
)
|
||||||
|
|
||||||
func setup(t *testing.T) *api.RestApi {
|
func setup(t *testing.T) *api.RestAPI {
|
||||||
|
repository.ResetConnection()
|
||||||
|
|
||||||
const testconfig = `{
|
const testconfig = `{
|
||||||
"addr": "0.0.0.0:8080",
|
"main": {
|
||||||
"validate": false,
|
"addr": "0.0.0.0:8080",
|
||||||
"archive": {
|
"validate": false,
|
||||||
"kind": "file",
|
"api-allowed-ips": [
|
||||||
"path": "./var/job-archive"
|
"*"
|
||||||
},
|
]
|
||||||
"jwts": {
|
},
|
||||||
"max-age": "2m"
|
"metric-store": {
|
||||||
|
"checkpoints": {
|
||||||
|
"interval": "12h"
|
||||||
},
|
},
|
||||||
"clusters": [
|
"retention-in-memory": "48h",
|
||||||
{
|
"memory-cap": 100
|
||||||
"name": "testcluster",
|
},
|
||||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
"archive": {
|
||||||
"filterRanges": {
|
"kind": "file",
|
||||||
"numNodes": { "from": 1, "to": 64 },
|
"path": "./var/job-archive"
|
||||||
"duration": { "from": 0, "to": 86400 },
|
},
|
||||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
"auth": {
|
||||||
}
|
"jwts": {
|
||||||
}
|
"max-age": "2m"
|
||||||
]
|
}
|
||||||
|
}
|
||||||
}`
|
}`
|
||||||
const testclusterJson = `{
|
const testclusterJSON = `{
|
||||||
"name": "testcluster",
|
"name": "testcluster",
|
||||||
"subClusters": [
|
"subClusters": [
|
||||||
{
|
{
|
||||||
@@ -110,61 +119,73 @@ func setup(t *testing.T) *api.RestApi {
|
|||||||
]
|
]
|
||||||
}`
|
}`
|
||||||
|
|
||||||
log.Init("info", true)
|
cclog.Init("info", true)
|
||||||
tmpdir := t.TempDir()
|
tmpdir := t.TempDir()
|
||||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
|
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0666); err != nil {
|
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
err := repository.MigrateDB(dbfilepath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
|
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
config.Init(cfgFilePath)
|
ccconf.Init(cfgFilePath)
|
||||||
|
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
|
||||||
|
|
||||||
|
// Load and check main configuration
|
||||||
|
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||||
|
config.Init(cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Abort("Main configuration must be present")
|
||||||
|
}
|
||||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||||
|
|
||||||
repository.Connect("sqlite3", dbfilepath)
|
repository.Connect(dbfilepath)
|
||||||
db := repository.GetConnection()
|
|
||||||
|
|
||||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
// metricstore initialization removed - it's initialized via callback in tests
|
||||||
t.Fatal(err)
|
|
||||||
|
archiver.Start(repository.GetJobRepository(), context.Background())
|
||||||
|
|
||||||
|
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||||
|
auth.Init(&cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Warn("Authentication disabled due to missing configuration")
|
||||||
|
auth.Init(nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
jobRepo := repository.GetJobRepository()
|
graph.Init()
|
||||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
|
||||||
|
|
||||||
return &api.RestApi{
|
return api.New()
|
||||||
JobRepository: resolver.Repo,
|
|
||||||
Resolver: resolver,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func cleanup() {
|
func cleanup() {
|
||||||
// TODO: Clear all caches, reset all modules, etc...
|
if err := archiver.Shutdown(5 * time.Second); err != nil {
|
||||||
|
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -175,7 +196,6 @@ func cleanup() {
|
|||||||
func TestRestApi(t *testing.T) {
|
func TestRestApi(t *testing.T) {
|
||||||
restapi := setup(t)
|
restapi := setup(t)
|
||||||
t.Cleanup(cleanup)
|
t.Cleanup(cleanup)
|
||||||
|
|
||||||
testData := schema.JobData{
|
testData := schema.JobData{
|
||||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||||
schema.MetricScopeNode: {
|
schema.MetricScopeNode: {
|
||||||
@@ -192,15 +212,19 @@ func TestRestApi(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||||
return testData, nil
|
return testData, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
r := mux.NewRouter()
|
r := chi.NewRouter()
|
||||||
restapi.MountRoutes(r)
|
restapi.MountAPIRoutes(r)
|
||||||
|
|
||||||
|
var TestJobID int64 = 123
|
||||||
|
TestClusterName := "testcluster"
|
||||||
|
var TestStartTime int64 = 123456789
|
||||||
|
|
||||||
const startJobBody string = `{
|
const startJobBody string = `{
|
||||||
"jobId": 123,
|
"jobId": 123,
|
||||||
"user": "testuser",
|
"user": "testuser",
|
||||||
"project": "testproj",
|
"project": "testproj",
|
||||||
"cluster": "testcluster",
|
"cluster": "testcluster",
|
||||||
@@ -210,10 +234,9 @@ func TestRestApi(t *testing.T) {
|
|||||||
"numNodes": 1,
|
"numNodes": 1,
|
||||||
"numHwthreads": 8,
|
"numHwthreads": 8,
|
||||||
"numAcc": 0,
|
"numAcc": 0,
|
||||||
"exclusive": 1,
|
"shared": "none",
|
||||||
"monitoringStatus": 1,
|
"monitoringStatus": 1,
|
||||||
"smt": 1,
|
"smt": 1,
|
||||||
"tags": [{ "type": "testTagType", "name": "testTagName" }],
|
|
||||||
"resources": [
|
"resources": [
|
||||||
{
|
{
|
||||||
"hostname": "host123",
|
"hostname": "host123",
|
||||||
@@ -224,31 +247,37 @@ func TestRestApi(t *testing.T) {
|
|||||||
"startTime": 123456789
|
"startTime": 123456789
|
||||||
}`
|
}`
|
||||||
|
|
||||||
var dbid int64
|
const contextUserKey repository.ContextKey = "user"
|
||||||
|
contextUserValue := &schema.User{
|
||||||
|
Username: "testuser",
|
||||||
|
Projects: make([]string, 0),
|
||||||
|
Roles: []string{"user"},
|
||||||
|
AuthType: 0,
|
||||||
|
AuthSource: 2,
|
||||||
|
}
|
||||||
|
|
||||||
if ok := t.Run("StartJob", func(t *testing.T) {
|
if ok := t.Run("StartJob", func(t *testing.T) {
|
||||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||||
recorder := httptest.NewRecorder()
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
r.ServeHTTP(recorder, req)
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
response := recorder.Result()
|
response := recorder.Result()
|
||||||
if response.StatusCode != http.StatusCreated {
|
if response.StatusCode != http.StatusCreated {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
}
|
}
|
||||||
|
// resolver := graph.GetResolverInstance()
|
||||||
var res api.StartJobApiResponse
|
restapi.JobRepository.SyncJobs()
|
||||||
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
|
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
|
// job.Tags, err = resolver.Job().Tags(ctx, job)
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
t.Fatal(err)
|
// t.Fatal(err)
|
||||||
}
|
// }
|
||||||
|
|
||||||
if job.JobID != 123 ||
|
if job.JobID != 123 ||
|
||||||
job.User != "testuser" ||
|
job.User != "testuser" ||
|
||||||
@@ -257,23 +286,20 @@ func TestRestApi(t *testing.T) {
|
|||||||
job.SubCluster != "sc1" ||
|
job.SubCluster != "sc1" ||
|
||||||
job.Partition != "default" ||
|
job.Partition != "default" ||
|
||||||
job.Walltime != 3600 ||
|
job.Walltime != 3600 ||
|
||||||
job.ArrayJobId != 0 ||
|
job.ArrayJobID != 0 ||
|
||||||
job.NumNodes != 1 ||
|
job.NumNodes != 1 ||
|
||||||
job.NumHWThreads != 8 ||
|
job.NumHWThreads != 8 ||
|
||||||
job.NumAcc != 0 ||
|
job.NumAcc != 0 ||
|
||||||
job.Exclusive != 1 ||
|
|
||||||
job.MonitoringStatus != 1 ||
|
job.MonitoringStatus != 1 ||
|
||||||
job.SMT != 1 ||
|
job.SMT != 1 ||
|
||||||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
|
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
|
||||||
job.StartTime.Unix() != 123456789 {
|
job.StartTime != 123456789 {
|
||||||
t.Fatalf("unexpected job properties: %#v", job)
|
t.Fatalf("unexpected job properties: %#v", job)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
|
// if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
|
||||||
t.Fatalf("unexpected tags: %#v", job.Tags)
|
// t.Fatalf("unexpected tags: %#v", job.Tags)
|
||||||
}
|
// }
|
||||||
|
|
||||||
dbid = res.DBID
|
|
||||||
}); !ok {
|
}); !ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -289,17 +315,19 @@ func TestRestApi(t *testing.T) {
|
|||||||
|
|
||||||
var stoppedJob *schema.Job
|
var stoppedJob *schema.Job
|
||||||
if ok := t.Run("StopJob", func(t *testing.T) {
|
if ok := t.Run("StopJob", func(t *testing.T) {
|
||||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||||
recorder := httptest.NewRecorder()
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
r.ServeHTTP(recorder, req)
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
response := recorder.Result()
|
response := recorder.Result()
|
||||||
if response.StatusCode != http.StatusOK {
|
if response.StatusCode != http.StatusOK {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
restapi.JobRepository.WaitForArchiving()
|
// Archiving happens asynchronously, will be completed in cleanup
|
||||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
|
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -327,7 +355,7 @@ func TestRestApi(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
t.Run("CheckArchive", func(t *testing.T) {
|
t.Run("CheckArchive", func(t *testing.T) {
|
||||||
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
|
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -339,12 +367,14 @@ func TestRestApi(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("CheckDoubleStart", func(t *testing.T) {
|
t.Run("CheckDoubleStart", func(t *testing.T) {
|
||||||
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
|
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
|
||||||
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
|
body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
|
||||||
|
|
||||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||||
recorder := httptest.NewRecorder()
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
r.ServeHTTP(recorder, req)
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
response := recorder.Result()
|
response := recorder.Result()
|
||||||
if response.StatusCode != http.StatusUnprocessableEntity {
|
if response.StatusCode != http.StatusUnprocessableEntity {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
@@ -359,7 +389,7 @@ func TestRestApi(t *testing.T) {
|
|||||||
"partition": "default",
|
"partition": "default",
|
||||||
"walltime": 3600,
|
"walltime": 3600,
|
||||||
"numNodes": 1,
|
"numNodes": 1,
|
||||||
"exclusive": 1,
|
"shared": "none",
|
||||||
"monitoringStatus": 1,
|
"monitoringStatus": 1,
|
||||||
"smt": 1,
|
"smt": 1,
|
||||||
"resources": [
|
"resources": [
|
||||||
@@ -371,10 +401,12 @@ func TestRestApi(t *testing.T) {
|
|||||||
}`
|
}`
|
||||||
|
|
||||||
ok := t.Run("StartJobFailed", func(t *testing.T) {
|
ok := t.Run("StartJobFailed", func(t *testing.T) {
|
||||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||||
recorder := httptest.NewRecorder()
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
r.ServeHTTP(recorder, req)
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
response := recorder.Result()
|
response := recorder.Result()
|
||||||
if response.StatusCode != http.StatusCreated {
|
if response.StatusCode != http.StatusCreated {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
@@ -384,8 +416,11 @@ func TestRestApi(t *testing.T) {
|
|||||||
t.Fatal("subtest failed")
|
t.Fatal("subtest failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
restapi.JobRepository.SyncJobs()
|
||||||
|
|
||||||
const stopJobBodyFailed string = `{
|
const stopJobBodyFailed string = `{
|
||||||
"jobId": 12345,
|
"jobId": 12345,
|
||||||
"cluster": "testcluster",
|
"cluster": "testcluster",
|
||||||
|
|
||||||
"jobState": "failed",
|
"jobState": "failed",
|
||||||
@@ -393,16 +428,18 @@ func TestRestApi(t *testing.T) {
|
|||||||
}`
|
}`
|
||||||
|
|
||||||
ok = t.Run("StopJobFailed", func(t *testing.T) {
|
ok = t.Run("StopJobFailed", func(t *testing.T) {
|
||||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||||
recorder := httptest.NewRecorder()
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
r.ServeHTTP(recorder, req)
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
response := recorder.Result()
|
response := recorder.Result()
|
||||||
if response.StatusCode != http.StatusOK {
|
if response.StatusCode != http.StatusOK {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
restapi.JobRepository.WaitForArchiving()
|
// Archiving happens asynchronously, will be completed in cleanup
|
||||||
jobid, cluster := int64(12345), "testcluster"
|
jobid, cluster := int64(12345), "testcluster"
|
||||||
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -416,4 +453,198 @@ func TestRestApi(t *testing.T) {
|
|||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("subtest failed")
|
t.Fatal("subtest failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
|
||||||
|
contextUserValue := &schema.User{
|
||||||
|
Username: "testuser",
|
||||||
|
Projects: make([]string, 0),
|
||||||
|
Roles: []string{"api"},
|
||||||
|
AuthType: 0,
|
||||||
|
AuthSource: 2,
|
||||||
|
}
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
|
||||||
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
|
response := recorder.Result()
|
||||||
|
if response.StatusCode != http.StatusOK {
|
||||||
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
var result api.GetUsedNodesAPIResponse
|
||||||
|
if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.UsedNodes == nil {
|
||||||
|
t.Fatal("expected usedNodes to be non-nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(result.UsedNodes) != 0 {
|
||||||
|
t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestStopJobWithReusedJobId verifies that stopping a recently started job works
|
||||||
|
// even when an older job with the same jobId exists in the job table (e.g. with
|
||||||
|
// state "failed"). This is a regression test for the bug where Find() on the job
|
||||||
|
// table would match the old job instead of the new one still in job_cache.
|
||||||
|
func TestStopJobWithReusedJobId(t *testing.T) {
|
||||||
|
restapi := setup(t)
|
||||||
|
t.Cleanup(cleanup)
|
||||||
|
|
||||||
|
testData := schema.JobData{
|
||||||
|
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||||
|
schema.MetricScopeNode: {
|
||||||
|
Unit: schema.Unit{Base: "load"},
|
||||||
|
Timestep: 60,
|
||||||
|
Series: []schema.Series{
|
||||||
|
{
|
||||||
|
Hostname: "host123",
|
||||||
|
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||||
|
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||||
|
return testData, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
r := chi.NewRouter()
|
||||||
|
restapi.MountAPIRoutes(r)
|
||||||
|
|
||||||
|
const contextUserKey repository.ContextKey = "user"
|
||||||
|
contextUserValue := &schema.User{
|
||||||
|
Username: "testuser",
|
||||||
|
Projects: make([]string, 0),
|
||||||
|
Roles: []string{"user"},
|
||||||
|
AuthType: 0,
|
||||||
|
AuthSource: 2,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 1: Start the first job (jobId=999)
|
||||||
|
const startJobBody1 string = `{
|
||||||
|
"jobId": 999,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "default",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
|
||||||
|
"startTime": 200000000
|
||||||
|
}`
|
||||||
|
|
||||||
|
if ok := t.Run("StartFirstJob", func(t *testing.T) {
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
|
if recorder.Result().StatusCode != http.StatusCreated {
|
||||||
|
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||||
|
}
|
||||||
|
}); !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Sync to move job from cache to job table, then stop it as "failed"
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
restapi.JobRepository.SyncJobs()
|
||||||
|
|
||||||
|
const stopJobBody1 string = `{
|
||||||
|
"jobId": 999,
|
||||||
|
"startTime": 200000000,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"jobState": "failed",
|
||||||
|
"stopTime": 200001000
|
||||||
|
}`
|
||||||
|
|
||||||
|
if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
|
if recorder.Result().StatusCode != http.StatusOK {
|
||||||
|
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
jobid, cluster := int64(999), "testcluster"
|
||||||
|
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if job.State != schema.JobStateFailed {
|
||||||
|
t.Fatalf("expected first job to be failed, got: %s", job.State)
|
||||||
|
}
|
||||||
|
}); !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for archiving to complete
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
|
||||||
|
// Step 3: Start a NEW job with the same jobId=999 but different startTime.
|
||||||
|
// This job will sit in job_cache (not yet synced).
|
||||||
|
const startJobBody2 string = `{
|
||||||
|
"jobId": 999,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "default",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
|
||||||
|
"startTime": 300000000
|
||||||
|
}`
|
||||||
|
|
||||||
|
if ok := t.Run("StartSecondJob", func(t *testing.T) {
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
|
if recorder.Result().StatusCode != http.StatusCreated {
|
||||||
|
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||||
|
}
|
||||||
|
}); !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: Stop the second job WITHOUT syncing first.
|
||||||
|
// Before the fix, this would fail because Find() on the job table would
|
||||||
|
// match the old failed job (jobId=999) and reject with "already stopped".
|
||||||
|
const stopJobBody2 string = `{
|
||||||
|
"jobId": 999,
|
||||||
|
"startTime": 300000000,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"jobState": "completed",
|
||||||
|
"stopTime": 300001000
|
||||||
|
}`
|
||||||
|
|
||||||
|
t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||||
|
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||||
|
if recorder.Result().StatusCode != http.StatusOK {
|
||||||
|
t.Fatalf("expected stop to succeed for cached job, got: %s %s",
|
||||||
|
recorder.Result().Status, recorder.Body.String())
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
71
internal/api/cluster.go
Normal file
71
internal/api/cluster.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetClustersAPIResponse model
|
||||||
|
type GetClustersAPIResponse struct {
|
||||||
|
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
|
||||||
|
}
|
||||||
|
|
||||||
|
// getClusters godoc
|
||||||
|
// @summary Lists all cluster configs
|
||||||
|
// @tags Cluster query
|
||||||
|
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
|
||||||
|
// @produce json
|
||||||
|
// @param cluster query string false "Job Cluster"
|
||||||
|
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/clusters/ [get]
|
||||||
|
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||||
|
!user.HasRole(schema.RoleAPI) {
|
||||||
|
|
||||||
|
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
|
bw := bufio.NewWriter(rw)
|
||||||
|
defer bw.Flush()
|
||||||
|
|
||||||
|
var clusters []*schema.Cluster
|
||||||
|
|
||||||
|
if r.URL.Query().Has("cluster") {
|
||||||
|
name := r.URL.Query().Get("cluster")
|
||||||
|
cluster := archive.GetCluster(name)
|
||||||
|
if cluster == nil {
|
||||||
|
handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
clusters = append(clusters, cluster)
|
||||||
|
} else {
|
||||||
|
clusters = archive.Clusters
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := GetClustersAPIResponse{
|
||||||
|
Clusters: clusters,
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||||
|
handleError(err, http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
1796
internal/api/docs.go
1796
internal/api/docs.go
File diff suppressed because it is too large
Load Diff
1154
internal/api/job.go
Normal file
1154
internal/api/job.go
Normal file
File diff suppressed because it is too large
Load Diff
165
internal/api/log.go
Normal file
165
internal/api/log.go
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
type LogEntry struct {
|
||||||
|
Timestamp string `json:"timestamp"`
|
||||||
|
Priority int `json:"priority"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
Unit string `json:"unit"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
|
||||||
|
|
||||||
|
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user := repository.GetUserFromContext(r.Context())
|
||||||
|
if !user.HasRole(schema.RoleAdmin) {
|
||||||
|
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
since := r.URL.Query().Get("since")
|
||||||
|
if since == "" {
|
||||||
|
since = "1 hour ago"
|
||||||
|
}
|
||||||
|
if !safePattern.MatchString(since) {
|
||||||
|
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := 200
|
||||||
|
if l := r.URL.Query().Get("lines"); l != "" {
|
||||||
|
n, err := strconv.Atoi(l)
|
||||||
|
if err != nil || n < 1 {
|
||||||
|
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if n > 1000 {
|
||||||
|
n = 1000
|
||||||
|
}
|
||||||
|
lines = n
|
||||||
|
}
|
||||||
|
|
||||||
|
unit := config.Keys.SystemdUnit
|
||||||
|
if unit == "" {
|
||||||
|
unit = "clustercockpit.service"
|
||||||
|
}
|
||||||
|
|
||||||
|
args := []string{
|
||||||
|
"--output=json",
|
||||||
|
"--no-pager",
|
||||||
|
"-n", fmt.Sprintf("%d", lines),
|
||||||
|
"--since", since,
|
||||||
|
"-u", unit,
|
||||||
|
}
|
||||||
|
|
||||||
|
if level := r.URL.Query().Get("level"); level != "" {
|
||||||
|
n, err := strconv.Atoi(level)
|
||||||
|
if err != nil || n < 0 || n > 7 {
|
||||||
|
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
args = append(args, "--priority", fmt.Sprintf("%d", n))
|
||||||
|
}
|
||||||
|
|
||||||
|
if search := r.URL.Query().Get("search"); search != "" {
|
||||||
|
if !safePattern.MatchString(search) {
|
||||||
|
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
args = append(args, "--grep", search)
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
|
||||||
|
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
entries := make([]LogEntry, 0, lines)
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
|
for scanner.Scan() {
|
||||||
|
var raw map[string]any
|
||||||
|
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
|
||||||
|
cclog.Debugf("error unmarshal log output: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
priority := 6 // default info
|
||||||
|
if p, ok := raw["PRIORITY"]; ok {
|
||||||
|
switch v := p.(type) {
|
||||||
|
case string:
|
||||||
|
if n, err := strconv.Atoi(v); err == nil {
|
||||||
|
priority = n
|
||||||
|
}
|
||||||
|
case float64:
|
||||||
|
priority = int(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
msg := ""
|
||||||
|
if m, ok := raw["MESSAGE"]; ok {
|
||||||
|
if s, ok := m.(string); ok {
|
||||||
|
msg = s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ts := ""
|
||||||
|
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
|
||||||
|
if s, ok := t.(string); ok {
|
||||||
|
ts = s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unitName := ""
|
||||||
|
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
|
||||||
|
if s, ok := u.(string); ok {
|
||||||
|
unitName = s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entries = append(entries, LogEntry{
|
||||||
|
Timestamp: ts,
|
||||||
|
Priority: priority,
|
||||||
|
Message: msg,
|
||||||
|
Unit: unitName,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := cmd.Wait(); err != nil {
|
||||||
|
// journalctl returns exit code 1 when --grep matches nothing
|
||||||
|
if len(entries) == 0 {
|
||||||
|
cclog.Debugf("journalctl exited with: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rw.Header().Set("Content-Type", "application/json")
|
||||||
|
if err := json.NewEncoder(rw).Encode(entries); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode log entries: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
151
internal/api/metricstore.go
Normal file
151
internal/api/metricstore.go
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
|
||||||
|
)
|
||||||
|
|
||||||
|
// handleFree godoc
|
||||||
|
// @summary
|
||||||
|
// @tags free
|
||||||
|
// @description This endpoint allows the users to free the Buffers from the
|
||||||
|
// metric store. This endpoint offers the users to remove then systematically
|
||||||
|
// and also allows then to prune the data under node, if they do not want to
|
||||||
|
// remove the whole node.
|
||||||
|
// @produce json
|
||||||
|
// @param to query string false "up to timestamp"
|
||||||
|
// @success 200 {string} string "ok"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /free/ [post]
|
||||||
|
func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rawTo := r.URL.Query().Get("to")
|
||||||
|
if rawTo == "" {
|
||||||
|
handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
to, err := strconv.ParseInt(rawTo, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
handleError(err, http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
bodyDec := json.NewDecoder(r.Body)
|
||||||
|
var selectors [][]string
|
||||||
|
err = bodyDec.Decode(&selectors)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ms := metricstore.GetMemoryStore()
|
||||||
|
n := 0
|
||||||
|
for _, sel := range selectors {
|
||||||
|
bn, err := ms.Free(sel, to)
|
||||||
|
if err != nil {
|
||||||
|
handleError(err, http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
n += bn
|
||||||
|
}
|
||||||
|
|
||||||
|
rw.WriteHeader(http.StatusOK)
|
||||||
|
fmt.Fprintf(rw, "buffers freed: %d\n", n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleWrite godoc
|
||||||
|
// @summary Receive metrics in InfluxDB line-protocol
|
||||||
|
// @tags write
|
||||||
|
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
|
||||||
|
|
||||||
|
// @accept plain
|
||||||
|
// @produce json
|
||||||
|
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
|
||||||
|
// @success 200 {string} string "ok"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /write/ [post]
|
||||||
|
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
|
|
||||||
|
// Extract the "cluster" query parameter without allocating a url.Values map.
|
||||||
|
cluster := queryParam(r.URL.RawQuery, "cluster")
|
||||||
|
|
||||||
|
// Stream directly from the request body instead of copying it into a
|
||||||
|
// temporary buffer via io.ReadAll. The line-protocol decoder supports
|
||||||
|
// io.Reader natively, so this avoids the largest heap allocation.
|
||||||
|
ms := metricstore.GetMemoryStore()
|
||||||
|
dec := lineprotocol.NewDecoder(r.Body)
|
||||||
|
if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
|
||||||
|
cclog.Errorf("/api/write error: %s", err.Error())
|
||||||
|
handleError(err, http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
rw.WriteHeader(http.StatusOK)
|
||||||
|
}
|
||||||
|
|
||||||
|
// queryParam extracts a single query-parameter value from a raw query string
|
||||||
|
// without allocating a url.Values map. Returns "" if the key is not present.
|
||||||
|
func queryParam(raw, key string) string {
|
||||||
|
for raw != "" {
|
||||||
|
var kv string
|
||||||
|
kv, raw, _ = strings.Cut(raw, "&")
|
||||||
|
k, v, _ := strings.Cut(kv, "=")
|
||||||
|
if k == key {
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleDebug godoc
|
||||||
|
// @summary Debug endpoint
|
||||||
|
// @tags debug
|
||||||
|
// @description This endpoint allows the users to print the content of
|
||||||
|
// nodes/clusters/metrics to review the state of the data.
|
||||||
|
// @produce json
|
||||||
|
// @param selector query string false "Selector"
|
||||||
|
// @success 200 {string} string "Debug dump"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /debug/ [post]
|
||||||
|
func debugMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
raw := r.URL.Query().Get("selector")
|
||||||
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
|
selector := []string{}
|
||||||
|
if len(raw) != 0 {
|
||||||
|
selector = strings.Split(raw, ":")
|
||||||
|
}
|
||||||
|
|
||||||
|
ms := metricstore.GetMemoryStore()
|
||||||
|
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
|
||||||
|
handleError(err, http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
400
internal/api/nats.go
Normal file
400
internal/api/nats.go
Normal file
@@ -0,0 +1,400 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/nats"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/receivers"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
|
||||||
|
// It mirrors the functionality of the REST API but uses NATS messaging with
|
||||||
|
// InfluxDB line protocol as the message format.
|
||||||
|
//
|
||||||
|
// # Message Format
|
||||||
|
//
|
||||||
|
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
|
||||||
|
// with the following structure:
|
||||||
|
//
|
||||||
|
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
|
||||||
|
//
|
||||||
|
// # Job Events
|
||||||
|
//
|
||||||
|
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
|
||||||
|
//
|
||||||
|
// job,function=start_job event="{...JSON payload...}" <timestamp>
|
||||||
|
// job,function=stop_job event="{...JSON payload...}" <timestamp>
|
||||||
|
//
|
||||||
|
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
|
||||||
|
//
|
||||||
|
// Example job start message:
|
||||||
|
//
|
||||||
|
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
|
||||||
|
//
|
||||||
|
// # Node State Events
|
||||||
|
//
|
||||||
|
// Node state updates use the "nodestate" measurement with cluster information:
|
||||||
|
//
|
||||||
|
// nodestate event="{...JSON payload...}" <timestamp>
|
||||||
|
//
|
||||||
|
// The JSON payload follows the UpdateNodeStatesRequest structure.
|
||||||
|
//
|
||||||
|
// Example node state message:
|
||||||
|
//
|
||||||
|
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
|
||||||
|
type NatsAPI struct {
|
||||||
|
JobRepository *repository.JobRepository
|
||||||
|
// RepositoryMutex protects job creation operations from race conditions
|
||||||
|
// when checking for duplicate jobs during startJob calls.
|
||||||
|
RepositoryMutex sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
|
||||||
|
func NewNatsAPI() *NatsAPI {
|
||||||
|
return &NatsAPI{
|
||||||
|
JobRepository: repository.GetJobRepository(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
|
||||||
|
// Returns an error if the NATS client is not available or subscription fails.
|
||||||
|
func (api *NatsAPI) StartSubscriptions() error {
|
||||||
|
client := nats.GetClient()
|
||||||
|
if client == nil {
|
||||||
|
cclog.Warn("NATS client not available, skipping API subscriptions")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.Keys.APISubjects != nil {
|
||||||
|
|
||||||
|
s := config.Keys.APISubjects
|
||||||
|
|
||||||
|
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Info("NATS API subscriptions started")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
|
||||||
|
// Validates that required tags and fields are present before processing.
|
||||||
|
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
|
||||||
|
function, ok := msg.GetTag("function")
|
||||||
|
if !ok {
|
||||||
|
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
switch function {
|
||||||
|
case "start_job":
|
||||||
|
v, ok := msg.GetEventValue()
|
||||||
|
if !ok {
|
||||||
|
cclog.Errorf("Job start event is missing event field with JSON payload")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
api.handleStartJob(v)
|
||||||
|
|
||||||
|
case "stop_job":
|
||||||
|
v, ok := msg.GetEventValue()
|
||||||
|
if !ok {
|
||||||
|
cclog.Errorf("Job stop event is missing event field with JSON payload")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
api.handleStopJob(v)
|
||||||
|
|
||||||
|
default:
|
||||||
|
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
|
||||||
|
// The message must be in line protocol format with measurement="job" and include:
|
||||||
|
// - tag "function" with value "start_job" or "stop_job"
|
||||||
|
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
|
||||||
|
//
|
||||||
|
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
|
||||||
|
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
|
||||||
|
if len(data) == 0 {
|
||||||
|
cclog.Warnf("NATS %s: received empty message", subject)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
d := influx.NewDecoderWithBytes(data)
|
||||||
|
|
||||||
|
for d.Next() {
|
||||||
|
m, err := receivers.DecodeInfluxMessage(d)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.IsEvent() {
|
||||||
|
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.Name() == "job" {
|
||||||
|
api.processJobEvent(m)
|
||||||
|
} else {
|
||||||
|
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleStartJob processes job start messages received via NATS.
|
||||||
|
// The payload parameter contains JSON following the schema.Job structure.
|
||||||
|
// Jobs are validated, checked for duplicates, and inserted into the database.
|
||||||
|
func (api *NatsAPI) handleStartJob(payload string) {
|
||||||
|
if payload == "" {
|
||||||
|
cclog.Error("NATS start job: payload is empty")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
req := schema.Job{
|
||||||
|
Shared: "none",
|
||||||
|
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||||
|
}
|
||||||
|
|
||||||
|
dec := json.NewDecoder(strings.NewReader(payload))
|
||||||
|
dec.DisallowUnknownFields()
|
||||||
|
if err := dec.Decode(&req); err != nil {
|
||||||
|
cclog.Errorf("NATS start job: parsing request failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Debugf("NATS start job: %s", req.GoString())
|
||||||
|
req.State = schema.JobStateRunning
|
||||||
|
|
||||||
|
if err := importer.SanityChecks(&req); err != nil {
|
||||||
|
cclog.Errorf("NATS start job: sanity check failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var unlockOnce sync.Once
|
||||||
|
api.RepositoryMutex.Lock()
|
||||||
|
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||||
|
|
||||||
|
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
||||||
|
if err != nil && err != sql.ErrNoRows {
|
||||||
|
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err == nil {
|
||||||
|
for _, job := range jobs {
|
||||||
|
if (req.StartTime - job.StartTime) < secondsPerDay {
|
||||||
|
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
|
||||||
|
req.JobID, req.Cluster, job.ID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// When tags are present, insert directly into the job table so that the
|
||||||
|
// returned ID can be used with AddTagOrCreate (which queries the job table).
|
||||||
|
var id int64
|
||||||
|
if len(req.Tags) > 0 {
|
||||||
|
id, err = api.JobRepository.StartDirect(&req)
|
||||||
|
} else {
|
||||||
|
id, err = api.JobRepository.Start(&req)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("NATS start job: insert into database failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||||
|
|
||||||
|
for _, tag := range req.Tags {
|
||||||
|
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||||
|
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
|
||||||
|
id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleStopJob processes job stop messages received via NATS.
|
||||||
|
// The payload parameter contains JSON following the StopJobAPIRequest structure.
|
||||||
|
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
|
||||||
|
func (api *NatsAPI) handleStopJob(payload string) {
|
||||||
|
if payload == "" {
|
||||||
|
cclog.Error("NATS stop job: payload is empty")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req StopJobAPIRequest
|
||||||
|
|
||||||
|
dec := json.NewDecoder(strings.NewReader(payload))
|
||||||
|
dec.DisallowUnknownFields()
|
||||||
|
if err := dec.Decode(&req); err != nil {
|
||||||
|
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if req.JobID == nil {
|
||||||
|
cclog.Errorf("NATS job stop: the field 'jobId' is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
isCached := false
|
||||||
|
job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
|
||||||
|
if err != nil {
|
||||||
|
// Not in cache, try main job table
|
||||||
|
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("NATS job stop: finding job failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
isCached = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if job.State != schema.JobStateRunning {
|
||||||
|
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
|
||||||
|
job.JobID, job.ID, job.Cluster, job.State)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if job.StartTime > req.StopTime {
|
||||||
|
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
|
||||||
|
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if req.State != "" && !req.State.Valid() {
|
||||||
|
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
|
||||||
|
job.JobID, job.ID, job.Cluster, req.State)
|
||||||
|
return
|
||||||
|
} else if req.State == "" {
|
||||||
|
req.State = schema.JobStateCompleted
|
||||||
|
}
|
||||||
|
|
||||||
|
job.Duration = int32(req.StopTime - job.StartTime)
|
||||||
|
job.State = req.State
|
||||||
|
api.JobRepository.Mutex.Lock()
|
||||||
|
defer api.JobRepository.Mutex.Unlock()
|
||||||
|
|
||||||
|
// If the job is still in job_cache, transfer it to the job table first
|
||||||
|
if isCached {
|
||||||
|
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
|
||||||
|
job.JobID, *job.ID, job.Cluster, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
|
||||||
|
job.ID = &newID
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||||
|
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
|
||||||
|
job.JobID, *job.ID, job.Cluster, job.State, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
|
||||||
|
*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||||
|
|
||||||
|
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
archiver.TriggerArchiving(job)
|
||||||
|
}
|
||||||
|
|
||||||
|
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
|
||||||
|
// Updates node states in the repository for all nodes in the payload.
|
||||||
|
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
|
||||||
|
v, ok := msg.GetEventValue()
|
||||||
|
if !ok {
|
||||||
|
cclog.Errorf("Nodestate event is missing event field with JSON payload")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req UpdateNodeStatesRequest
|
||||||
|
|
||||||
|
dec := json.NewDecoder(strings.NewReader(v))
|
||||||
|
dec.DisallowUnknownFields()
|
||||||
|
if err := dec.Decode(&req); err != nil {
|
||||||
|
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
repo := repository.GetNodeRepository()
|
||||||
|
requestReceived := time.Now().Unix()
|
||||||
|
|
||||||
|
for _, node := range req.Nodes {
|
||||||
|
state := determineState(node.States)
|
||||||
|
nodeState := schema.NodeStateDB{
|
||||||
|
TimeStamp: requestReceived,
|
||||||
|
NodeState: state,
|
||||||
|
CpusAllocated: node.CpusAllocated,
|
||||||
|
MemoryAllocated: node.MemoryAllocated,
|
||||||
|
GpusAllocated: node.GpusAllocated,
|
||||||
|
HealthState: schema.MonitoringStateFull,
|
||||||
|
JobsRunning: node.JobsRunning,
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||||
|
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
|
||||||
|
node.Hostname, req.Cluster, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
|
||||||
|
// The message must be in line protocol format with measurement="nodestate" and include:
|
||||||
|
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
|
||||||
|
//
|
||||||
|
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
|
||||||
|
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
|
||||||
|
if len(data) == 0 {
|
||||||
|
cclog.Warnf("NATS %s: received empty message", subject)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
d := influx.NewDecoderWithBytes(data)
|
||||||
|
|
||||||
|
for d.Next() {
|
||||||
|
m, err := receivers.DecodeInfluxMessage(d)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.IsEvent() {
|
||||||
|
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.Name() == "nodestate" {
|
||||||
|
api.processNodestateEvent(m)
|
||||||
|
} else {
|
||||||
|
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
947
internal/api/nats_test.go
Normal file
947
internal/api/nats_test.go
Normal file
@@ -0,0 +1,947 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
|
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
|
||||||
|
_ "github.com/mattn/go-sqlite3"
|
||||||
|
)
|
||||||
|
|
||||||
|
func setupNatsTest(t *testing.T) *NatsAPI {
|
||||||
|
repository.ResetConnection()
|
||||||
|
|
||||||
|
const testconfig = `{
|
||||||
|
"main": {
|
||||||
|
"addr": "0.0.0.0:8080",
|
||||||
|
"validate": false,
|
||||||
|
"api-allowed-ips": [
|
||||||
|
"*"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"archive": {
|
||||||
|
"kind": "file",
|
||||||
|
"path": "./var/job-archive"
|
||||||
|
},
|
||||||
|
"auth": {
|
||||||
|
"jwts": {
|
||||||
|
"max-age": "2m"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
const testclusterJSON = `{
|
||||||
|
"name": "testcluster",
|
||||||
|
"subClusters": [
|
||||||
|
{
|
||||||
|
"name": "sc1",
|
||||||
|
"nodes": "host123,host124,host125",
|
||||||
|
"processorType": "Intel Core i7-4770",
|
||||||
|
"socketsPerNode": 1,
|
||||||
|
"coresPerSocket": 4,
|
||||||
|
"threadsPerCore": 2,
|
||||||
|
"flopRateScalar": {
|
||||||
|
"unit": {
|
||||||
|
"prefix": "G",
|
||||||
|
"base": "F/s"
|
||||||
|
},
|
||||||
|
"value": 14
|
||||||
|
},
|
||||||
|
"flopRateSimd": {
|
||||||
|
"unit": {
|
||||||
|
"prefix": "G",
|
||||||
|
"base": "F/s"
|
||||||
|
},
|
||||||
|
"value": 112
|
||||||
|
},
|
||||||
|
"memoryBandwidth": {
|
||||||
|
"unit": {
|
||||||
|
"prefix": "G",
|
||||||
|
"base": "B/s"
|
||||||
|
},
|
||||||
|
"value": 24
|
||||||
|
},
|
||||||
|
"numberOfNodes": 70,
|
||||||
|
"topology": {
|
||||||
|
"node": [0, 1, 2, 3, 4, 5, 6, 7],
|
||||||
|
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||||
|
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||||
|
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||||
|
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metricConfig": [
|
||||||
|
{
|
||||||
|
"name": "load_one",
|
||||||
|
"unit": { "base": ""},
|
||||||
|
"scope": "node",
|
||||||
|
"timestep": 60,
|
||||||
|
"aggregation": "avg",
|
||||||
|
"peak": 8,
|
||||||
|
"normal": 0,
|
||||||
|
"caution": 0,
|
||||||
|
"alert": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`
|
||||||
|
|
||||||
|
cclog.Init("info", true)
|
||||||
|
tmpdir := t.TempDir()
|
||||||
|
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||||
|
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||||
|
err := repository.MigrateDB(dbfilepath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||||
|
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ccconf.Init(cfgFilePath)
|
||||||
|
|
||||||
|
// Load and check main configuration
|
||||||
|
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||||
|
config.Init(cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Abort("Main configuration must be present")
|
||||||
|
}
|
||||||
|
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||||
|
|
||||||
|
repository.Connect(dbfilepath)
|
||||||
|
|
||||||
|
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// metricstore initialization removed - it's initialized via callback in tests
|
||||||
|
|
||||||
|
archiver.Start(repository.GetJobRepository(), context.Background())
|
||||||
|
|
||||||
|
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||||
|
auth.Init(&cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Warn("Authentication disabled due to missing configuration")
|
||||||
|
auth.Init(nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
graph.Init()
|
||||||
|
|
||||||
|
return NewNatsAPI()
|
||||||
|
}
|
||||||
|
|
||||||
|
func cleanupNatsTest() {
|
||||||
|
if err := archiver.Shutdown(5 * time.Second); err != nil {
|
||||||
|
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleStartJob(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
payload string
|
||||||
|
expectError bool
|
||||||
|
validateJob func(t *testing.T, job *schema.Job)
|
||||||
|
shouldFindJob bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "valid job start",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 1001,
|
||||||
|
"user": "testuser1",
|
||||||
|
"project": "testproj1",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 7200,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567890
|
||||||
|
}`,
|
||||||
|
expectError: false,
|
||||||
|
shouldFindJob: true,
|
||||||
|
validateJob: func(t *testing.T, job *schema.Job) {
|
||||||
|
if job.JobID != 1001 {
|
||||||
|
t.Errorf("expected JobID 1001, got %d", job.JobID)
|
||||||
|
}
|
||||||
|
if job.User != "testuser1" {
|
||||||
|
t.Errorf("expected user testuser1, got %s", job.User)
|
||||||
|
}
|
||||||
|
if job.State != schema.JobStateRunning {
|
||||||
|
t.Errorf("expected state running, got %s", job.State)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid JSON",
|
||||||
|
payload: `{
|
||||||
|
"jobId": "not a number",
|
||||||
|
"user": "testuser2"
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
shouldFindJob: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing required fields",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 1002
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
shouldFindJob: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 1003,
|
||||||
|
"user": "testuser3",
|
||||||
|
"project": "testproj3",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"unknownField": "should cause error",
|
||||||
|
"startTime": 1234567900
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
shouldFindJob: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "job with tags",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 1004,
|
||||||
|
"user": "testuser4",
|
||||||
|
"project": "testproj4",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"tags": [
|
||||||
|
{
|
||||||
|
"type": "test",
|
||||||
|
"name": "testtag",
|
||||||
|
"scope": "testuser4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567910
|
||||||
|
}`,
|
||||||
|
expectError: false,
|
||||||
|
shouldFindJob: true,
|
||||||
|
validateJob: func(t *testing.T, job *schema.Job) {
|
||||||
|
if job.JobID != 1004 {
|
||||||
|
t.Errorf("expected JobID 1004, got %d", job.JobID)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
natsAPI.handleStartJob(tt.payload)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
|
||||||
|
// Allow some time for async operations
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
|
||||||
|
if tt.shouldFindJob {
|
||||||
|
// Extract jobId from payload
|
||||||
|
var payloadMap map[string]any
|
||||||
|
json.Unmarshal([]byte(tt.payload), &payloadMap)
|
||||||
|
jobID := int64(payloadMap["jobId"].(float64))
|
||||||
|
cluster := payloadMap["cluster"].(string)
|
||||||
|
startTime := int64(payloadMap["startTime"].(float64))
|
||||||
|
|
||||||
|
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
|
||||||
|
if err != nil {
|
||||||
|
if !tt.expectError {
|
||||||
|
t.Fatalf("expected to find job, but got error: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.validateJob != nil {
|
||||||
|
tt.validateJob(t, job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleStopJob(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
// First, create a running job
|
||||||
|
startPayload := `{
|
||||||
|
"jobId": 2001,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567890
|
||||||
|
}`
|
||||||
|
|
||||||
|
natsAPI.handleStartJob(startPayload)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
payload string
|
||||||
|
expectError bool
|
||||||
|
validateJob func(t *testing.T, job *schema.Job)
|
||||||
|
setupJobFunc func() // Optional: create specific test job
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "valid job stop - completed",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 2001,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"startTime": 1234567890,
|
||||||
|
"jobState": "completed",
|
||||||
|
"stopTime": 1234571490
|
||||||
|
}`,
|
||||||
|
expectError: false,
|
||||||
|
validateJob: func(t *testing.T, job *schema.Job) {
|
||||||
|
if job.State != schema.JobStateCompleted {
|
||||||
|
t.Errorf("expected state completed, got %s", job.State)
|
||||||
|
}
|
||||||
|
expectedDuration := int32(1234571490 - 1234567890)
|
||||||
|
if job.Duration != expectedDuration {
|
||||||
|
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "valid job stop - failed",
|
||||||
|
setupJobFunc: func() {
|
||||||
|
startPayloadFailed := `{
|
||||||
|
"jobId": 2002,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567900
|
||||||
|
}`
|
||||||
|
natsAPI.handleStartJob(startPayloadFailed)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
},
|
||||||
|
payload: `{
|
||||||
|
"jobId": 2002,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"startTime": 1234567900,
|
||||||
|
"jobState": "failed",
|
||||||
|
"stopTime": 1234569900
|
||||||
|
}`,
|
||||||
|
expectError: false,
|
||||||
|
validateJob: func(t *testing.T, job *schema.Job) {
|
||||||
|
if job.State != schema.JobStateFailed {
|
||||||
|
t.Errorf("expected state failed, got %s", job.State)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid JSON",
|
||||||
|
payload: `{
|
||||||
|
"jobId": "not a number"
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing jobId",
|
||||||
|
payload: `{
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"jobState": "completed",
|
||||||
|
"stopTime": 1234571490
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid job state",
|
||||||
|
setupJobFunc: func() {
|
||||||
|
startPayloadInvalid := `{
|
||||||
|
"jobId": 2003,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567910
|
||||||
|
}`
|
||||||
|
natsAPI.handleStartJob(startPayloadInvalid)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
},
|
||||||
|
payload: `{
|
||||||
|
"jobId": 2003,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"startTime": 1234567910,
|
||||||
|
"jobState": "invalid_state",
|
||||||
|
"stopTime": 1234571510
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "stopTime before startTime",
|
||||||
|
setupJobFunc: func() {
|
||||||
|
startPayloadTime := `{
|
||||||
|
"jobId": 2004,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567920
|
||||||
|
}`
|
||||||
|
natsAPI.handleStartJob(startPayloadTime)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
},
|
||||||
|
payload: `{
|
||||||
|
"jobId": 2004,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"startTime": 1234567920,
|
||||||
|
"jobState": "completed",
|
||||||
|
"stopTime": 1234567900
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "job not found",
|
||||||
|
payload: `{
|
||||||
|
"jobId": 99999,
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"startTime": 1234567890,
|
||||||
|
"jobState": "completed",
|
||||||
|
"stopTime": 1234571490
|
||||||
|
}`,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
testData := schema.JobData{
|
||||||
|
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||||
|
schema.MetricScopeNode: {
|
||||||
|
Unit: schema.Unit{Base: "load"},
|
||||||
|
Timestep: 60,
|
||||||
|
Series: []schema.Series{
|
||||||
|
{
|
||||||
|
Hostname: "host123",
|
||||||
|
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||||
|
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||||
|
return testData, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if tt.setupJobFunc != nil {
|
||||||
|
tt.setupJobFunc()
|
||||||
|
}
|
||||||
|
|
||||||
|
natsAPI.handleStopJob(tt.payload)
|
||||||
|
|
||||||
|
// Allow some time for async operations
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
|
||||||
|
if !tt.expectError && tt.validateJob != nil {
|
||||||
|
// Extract job details from payload
|
||||||
|
var payloadMap map[string]any
|
||||||
|
json.Unmarshal([]byte(tt.payload), &payloadMap)
|
||||||
|
jobID := int64(payloadMap["jobId"].(float64))
|
||||||
|
cluster := payloadMap["cluster"].(string)
|
||||||
|
|
||||||
|
var startTime *int64
|
||||||
|
if st, ok := payloadMap["startTime"]; ok {
|
||||||
|
t := int64(st.(float64))
|
||||||
|
startTime = &t
|
||||||
|
}
|
||||||
|
|
||||||
|
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("expected to find job, but got error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tt.validateJob(t, job)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleNodeState(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
data []byte
|
||||||
|
expectError bool
|
||||||
|
validateFn func(t *testing.T)
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "valid node state update",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
validateFn: func(t *testing.T) {
|
||||||
|
// In a full test, we would verify the node state was updated in the database
|
||||||
|
// For now, just ensure no error occurred
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multiple nodes",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid JSON in event field",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty nodes array",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
|
||||||
|
expectError: false, // Empty array should not cause error
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid line protocol format",
|
||||||
|
data: []byte(`invalid line protocol format`),
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty data",
|
||||||
|
data: []byte(``),
|
||||||
|
expectError: false, // Should be handled gracefully with warning
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
natsAPI.handleNodeState("test.subject", tt.data)
|
||||||
|
|
||||||
|
// Allow some time for async operations
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
if tt.validateFn != nil {
|
||||||
|
tt.validateFn(t)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsProcessJobEvent(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
msgStartJob, err := lp.NewMessage(
|
||||||
|
"job",
|
||||||
|
map[string]string{"function": "start_job"},
|
||||||
|
nil,
|
||||||
|
map[string]any{
|
||||||
|
"event": `{
|
||||||
|
"jobId": 3001,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567890
|
||||||
|
}`,
|
||||||
|
},
|
||||||
|
time.Now(),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to create test message: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
msgMissingTag, err := lp.NewMessage(
|
||||||
|
"job",
|
||||||
|
map[string]string{},
|
||||||
|
nil,
|
||||||
|
map[string]any{
|
||||||
|
"event": `{}`,
|
||||||
|
},
|
||||||
|
time.Now(),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to create test message: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
msgUnknownFunc, err := lp.NewMessage(
|
||||||
|
"job",
|
||||||
|
map[string]string{"function": "unknown_function"},
|
||||||
|
nil,
|
||||||
|
map[string]any{
|
||||||
|
"event": `{}`,
|
||||||
|
},
|
||||||
|
time.Now(),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to create test message: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
message lp.CCMessage
|
||||||
|
expectError bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "start_job function",
|
||||||
|
message: msgStartJob,
|
||||||
|
expectError: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing function tag",
|
||||||
|
message: msgMissingTag,
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unknown function",
|
||||||
|
message: msgUnknownFunc,
|
||||||
|
expectError: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
natsAPI.processJobEvent(tt.message)
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleJobEvent(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
data []byte
|
||||||
|
expectError bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "valid influx line protocol",
|
||||||
|
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid influx line protocol",
|
||||||
|
data: []byte(`invalid line protocol format`),
|
||||||
|
expectError: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty data",
|
||||||
|
data: []byte(``),
|
||||||
|
expectError: false, // Decoder should handle empty input gracefully
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
// HandleJobEvent doesn't return errors, it logs them
|
||||||
|
// We're just ensuring it doesn't panic
|
||||||
|
natsAPI.handleJobEvent("test.subject", tt.data)
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
data []byte
|
||||||
|
expectError bool
|
||||||
|
description string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "non-event message (metric data)",
|
||||||
|
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should skip non-event messages gracefully",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "wrong measurement name",
|
||||||
|
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should warn about unexpected measurement but not fail",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing event field",
|
||||||
|
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
|
||||||
|
expectError: true,
|
||||||
|
description: "Should error when event field is missing",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multiple measurements in one message",
|
||||||
|
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should process multiple lines",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "escaped quotes in JSON payload",
|
||||||
|
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
|
||||||
|
expectError: true,
|
||||||
|
description: "Should handle escaped quotes (though JSON parsing may fail)",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
natsAPI.handleJobEvent("test.subject", tt.data)
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
data []byte
|
||||||
|
expectError bool
|
||||||
|
description string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "missing cluster field in JSON",
|
||||||
|
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
|
||||||
|
expectError: true,
|
||||||
|
description: "Should fail when cluster is missing",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "malformed JSON with unescaped quotes",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
|
||||||
|
expectError: true,
|
||||||
|
description: "Should fail on malformed JSON",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unicode characters in hostname",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should handle unicode characters",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "very large node count",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should handle multiple nodes efficiently",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "timestamp in past",
|
||||||
|
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
|
||||||
|
expectError: false,
|
||||||
|
description: "Should accept any valid timestamp",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
natsAPI.handleNodeState("test.subject", tt.data)
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
|
||||||
|
natsAPI := setupNatsTest(t)
|
||||||
|
t.Cleanup(cleanupNatsTest)
|
||||||
|
|
||||||
|
// Start a job
|
||||||
|
payload := `{
|
||||||
|
"jobId": 5001,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567890
|
||||||
|
}`
|
||||||
|
|
||||||
|
natsAPI.handleStartJob(payload)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
|
||||||
|
// Try to start the same job again (within 24 hours)
|
||||||
|
duplicatePayload := `{
|
||||||
|
"jobId": 5001,
|
||||||
|
"user": "testuser",
|
||||||
|
"project": "testproj",
|
||||||
|
"cluster": "testcluster",
|
||||||
|
"partition": "main",
|
||||||
|
"walltime": 3600,
|
||||||
|
"numNodes": 1,
|
||||||
|
"numHwthreads": 8,
|
||||||
|
"numAcc": 0,
|
||||||
|
"shared": "none",
|
||||||
|
"monitoringStatus": 1,
|
||||||
|
"smt": 1,
|
||||||
|
"resources": [
|
||||||
|
{
|
||||||
|
"hostname": "host123",
|
||||||
|
"hwthreads": [0, 1, 2, 3]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"startTime": 1234567900
|
||||||
|
}`
|
||||||
|
|
||||||
|
natsAPI.handleStartJob(duplicatePayload)
|
||||||
|
natsAPI.JobRepository.SyncJobs()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
|
||||||
|
// Verify only one job exists
|
||||||
|
jobID := int64(5001)
|
||||||
|
cluster := "testcluster"
|
||||||
|
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
|
||||||
|
if err != nil && err != sql.ErrNoRows {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(jobs) != 1 {
|
||||||
|
t.Errorf("expected 1 job, got %d", len(jobs))
|
||||||
|
}
|
||||||
|
}
|
||||||
145
internal/api/node.go
Normal file
145
internal/api/node.go
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"maps"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpdateNodeStatesRequest is the request body for the node-state update
// endpoint: the per-node scheduler payloads plus the cluster they belong to.
type UpdateNodeStatesRequest struct {
	// Nodes lists the per-host state payloads reported by the scheduler.
	Nodes []schema.NodePayload `json:"nodes"`
	// Cluster names the cluster all listed nodes belong to.
	Cluster string `json:"cluster" example:"fritz"`
}
|
||||||
|
|
||||||
|
// metricListToNames converts a map of metric configurations to a list of metric names
|
||||||
|
func metricListToNames(metricList map[string]*schema.Metric) []string {
|
||||||
|
names := make([]string, 0, len(metricList))
|
||||||
|
for name := range metricList {
|
||||||
|
names = append(names, name)
|
||||||
|
}
|
||||||
|
return names
|
||||||
|
}
|
||||||
|
|
||||||
|
// determineState maps a scheduler's state strings to a single
// schema.SchedulerState. This routine assumes that only one of the known
// states exists per node; the first recognized state (in input order) wins.
// Unrecognized or empty input yields schema.NodeStateUnknown.
func determineState(states []string) schema.SchedulerState {
	for _, state := range states {
		// Comparison is case-insensitive; schedulers differ in casing.
		switch strings.ToLower(state) {
		case "allocated":
			return schema.NodeStateAllocated
		case "reserved":
			return schema.NodeStateReserved
		case "idle":
			return schema.NodeStateIdle
		case "down":
			return schema.NodeStateDown
		case "mixed":
			return schema.NodeStateMixed
		}
	}

	return schema.NodeStateUnknown
}
|
||||||
|
|
||||||
|
// updateNodeStates godoc
|
||||||
|
// @summary Deliver updated Slurm node states
|
||||||
|
// @tags Nodestates
|
||||||
|
// @description Returns a JSON-encoded list of users.
|
||||||
|
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||||
|
// @produce json
|
||||||
|
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
|
||||||
|
// @success 200 {object} api.DefaultAPIResponse "Success message"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/nodestats/ [post]
|
||||||
|
func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
// Parse request body
|
||||||
|
req := UpdateNodeStatesRequest{}
|
||||||
|
if err := decode(r.Body, &req); err != nil {
|
||||||
|
handleError(fmt.Errorf("parsing request body failed: %w", err),
|
||||||
|
http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
requestReceived := time.Now().Unix()
|
||||||
|
repo := repository.GetNodeRepository()
|
||||||
|
|
||||||
|
m := make(map[string][]string)
|
||||||
|
metricNames := make(map[string][]string)
|
||||||
|
healthResults := make(map[string]metricstore.HealthCheckResult)
|
||||||
|
|
||||||
|
startMs := time.Now()
|
||||||
|
|
||||||
|
// Step 1: Build nodeList and metricList per subcluster
|
||||||
|
for _, node := range req.Nodes {
|
||||||
|
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||||
|
m[sc] = append(m[sc], node.Hostname)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for sc := range m {
|
||||||
|
if sc != "" {
|
||||||
|
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
|
||||||
|
metricNames[sc] = metricListToNames(metricList)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Determine which metric store to query and perform health check
|
||||||
|
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
|
||||||
|
} else {
|
||||||
|
for sc, nl := range m {
|
||||||
|
if sc != "" {
|
||||||
|
if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
|
||||||
|
maps.Copy(healthResults, results)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
|
||||||
|
startDB := time.Now()
|
||||||
|
|
||||||
|
for _, node := range req.Nodes {
|
||||||
|
state := determineState(node.States)
|
||||||
|
healthState := schema.MonitoringStateFailed
|
||||||
|
var healthMetrics string
|
||||||
|
if result, ok := healthResults[node.Hostname]; ok {
|
||||||
|
healthState = result.State
|
||||||
|
healthMetrics = result.HealthMetrics
|
||||||
|
}
|
||||||
|
nodeState := schema.NodeStateDB{
|
||||||
|
TimeStamp: requestReceived,
|
||||||
|
NodeState: state,
|
||||||
|
CpusAllocated: node.CpusAllocated,
|
||||||
|
MemoryAllocated: node.MemoryAllocated,
|
||||||
|
GpusAllocated: node.GpusAllocated,
|
||||||
|
HealthState: healthState,
|
||||||
|
HealthMetrics: healthMetrics,
|
||||||
|
JobsRunning: node.JobsRunning,
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||||
|
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
|
||||||
|
node.Hostname, req.Cluster, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
|
||||||
|
}
|
||||||
1651
internal/api/rest.go
1651
internal/api/rest.go
File diff suppressed because it is too large
Load Diff
221
internal/api/user.go
Normal file
221
internal/api/user.go
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
)
|
||||||
|
|
||||||
|
// APIReturnedUser is the JSON shape of a single user entry returned by the
// user-listing endpoint.
type APIReturnedUser struct {
	// Username is the unique login name.
	Username string `json:"username"`
	// Name is the user's display name.
	Name string `json:"name"`
	// Roles lists the role names assigned to the user.
	Roles []string `json:"roles"`
	// Email is the user's contact address.
	Email string `json:"email"`
	// Projects lists the projects the user is associated with.
	Projects []string `json:"projects"`
}
||||||
|
|
||||||
|
// getUsers godoc
|
||||||
|
// @summary Returns a list of users
|
||||||
|
// @tags User
|
||||||
|
// @description Returns a JSON-encoded list of users.
|
||||||
|
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||||
|
// @produce json
|
||||||
|
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
|
||||||
|
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
|
||||||
|
// @failure 400 {string} string "Bad Request"
|
||||||
|
// @failure 401 {string} string "Unauthorized"
|
||||||
|
// @failure 403 {string} string "Forbidden"
|
||||||
|
// @failure 500 {string} string "Internal Server Error"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/users/ [get]
|
||||||
|
func (api *RestAPI) getUsers(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
// SecuredCheck() only worked with TokenAuth: Removed
|
||||||
|
|
||||||
|
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||||
|
handleError(fmt.Errorf("only admins are allowed to fetch a list of users"), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
|
||||||
|
if err != nil {
|
||||||
|
handleError(fmt.Errorf("listing users failed: %w", err), http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rw.Header().Set("Content-Type", "application/json")
|
||||||
|
if err := json.NewEncoder(rw).Encode(users); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode users response: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateUser godoc
|
||||||
|
// @summary Update user roles and projects
|
||||||
|
// @tags User
|
||||||
|
// @description Allows admins to add/remove roles and projects for a user
|
||||||
|
// @produce plain
|
||||||
|
// @param id path string true "Username"
|
||||||
|
// @param add-role formData string false "Role to add"
|
||||||
|
// @param remove-role formData string false "Role to remove"
|
||||||
|
// @param add-project formData string false "Project to add"
|
||||||
|
// @param remove-project formData string false "Project to remove"
|
||||||
|
// @success 200 {string} string "Success message"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/user/{id} [post]
|
||||||
|
func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
// SecuredCheck() only worked with TokenAuth: Removed
|
||||||
|
|
||||||
|
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||||
|
handleError(fmt.Errorf("only admins are allowed to update a user"), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get Values
|
||||||
|
newrole := r.FormValue("add-role")
|
||||||
|
delrole := r.FormValue("remove-role")
|
||||||
|
newproj := r.FormValue("add-project")
|
||||||
|
delproj := r.FormValue("remove-project")
|
||||||
|
|
||||||
|
rw.Header().Set("Content-Type", "application/json")
|
||||||
|
|
||||||
|
// Handle role updates
|
||||||
|
if newrole != "" {
|
||||||
|
if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
|
||||||
|
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Role Success"}); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode response: %v", err)
|
||||||
|
}
|
||||||
|
} else if delrole != "" {
|
||||||
|
if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
|
||||||
|
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Role Success"}); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode response: %v", err)
|
||||||
|
}
|
||||||
|
} else if newproj != "" {
|
||||||
|
if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
|
||||||
|
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Project Success"}); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode response: %v", err)
|
||||||
|
}
|
||||||
|
} else if delproj != "" {
|
||||||
|
if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
|
||||||
|
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Project Success"}); err != nil {
|
||||||
|
cclog.Errorf("Failed to encode response: %v", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
handleError(fmt.Errorf("no operation specified: must provide add-role, remove-role, add-project, or remove-project"), http.StatusBadRequest, rw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// createUser godoc
|
||||||
|
// @summary Create a new user
|
||||||
|
// @tags User
|
||||||
|
// @description Creates a new user with specified credentials and role
|
||||||
|
// @produce plain
|
||||||
|
// @param username formData string true "Username"
|
||||||
|
// @param password formData string false "Password (not required for API users)"
|
||||||
|
// @param role formData string true "User role"
|
||||||
|
// @param name formData string false "Full name"
|
||||||
|
// @param email formData string false "Email address"
|
||||||
|
// @param project formData string false "Project (required for managers)"
|
||||||
|
// @success 200 {string} string "Success message"
|
||||||
|
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/users/ [post]
|
||||||
|
func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
// SecuredCheck() only worked with TokenAuth: Removed
|
||||||
|
|
||||||
|
rw.Header().Set("Content-Type", "text/plain")
|
||||||
|
me := repository.GetUserFromContext(r.Context())
|
||||||
|
if !me.HasRole(schema.RoleAdmin) {
|
||||||
|
handleError(fmt.Errorf("only admins are allowed to create new users"), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
username, password, role, name, email, project := r.FormValue("username"),
|
||||||
|
r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
|
||||||
|
r.FormValue("email"), r.FormValue("project")
|
||||||
|
|
||||||
|
// Validate username length
|
||||||
|
if len(username) == 0 || len(username) > 100 {
|
||||||
|
handleError(fmt.Errorf("username must be between 1 and 100 characters"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
|
||||||
|
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
|
||||||
|
handleError(fmt.Errorf("only managers require a project (can be changed later)"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
|
||||||
|
handleError(fmt.Errorf("managers require a project to manage (can be changed later)"), http.StatusBadRequest, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := repository.GetUserRepository().AddUser(&schema.User{
|
||||||
|
Username: username,
|
||||||
|
Name: name,
|
||||||
|
Password: password,
|
||||||
|
Email: email,
|
||||||
|
Projects: []string{project},
|
||||||
|
Roles: []string{role},
|
||||||
|
}); err != nil {
|
||||||
|
handleError(fmt.Errorf("adding user failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(rw, "User %v successfully created!\n", username)
|
||||||
|
}
|
||||||
|
|
||||||
|
// deleteUser godoc
|
||||||
|
// @summary Delete a user
|
||||||
|
// @tags User
|
||||||
|
// @description Deletes a user from the system
|
||||||
|
// @produce plain
|
||||||
|
// @param username formData string true "Username to delete"
|
||||||
|
// @success 200 {string} string "Success"
|
||||||
|
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||||
|
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||||
|
// @security ApiKeyAuth
|
||||||
|
// @router /api/users/ [delete]
|
||||||
|
func (api *RestAPI) deleteUser(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
// SecuredCheck() only worked with TokenAuth: Removed
|
||||||
|
|
||||||
|
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||||
|
handleError(fmt.Errorf("only admins are allowed to delete a user"), http.StatusForbidden, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
username := r.FormValue("username")
|
||||||
|
if err := repository.GetUserRepository().DelUser(username); err != nil {
|
||||||
|
handleError(fmt.Errorf("deleting user failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rw.WriteHeader(http.StatusOK)
|
||||||
|
}
|
||||||
189
internal/archiver/README.md
Normal file
189
internal/archiver/README.md
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
# Archiver Package
|
||||||
|
|
||||||
|
The `archiver` package provides asynchronous job archiving functionality for ClusterCockpit. When jobs complete, their metric data is archived from the metric store to a persistent archive backend (filesystem, S3, SQLite, etc.).
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Producer-Consumer Pattern
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────┐ TriggerArchiving() ┌───────────────┐
|
||||||
|
│ API Handler │ ───────────────────────▶ │ archiveChannel│
|
||||||
|
│ (Job Stop) │ │ (buffer: 128)│
|
||||||
|
└──────────────┘ └───────┬───────┘
|
||||||
|
│
|
||||||
|
┌─────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────────────┐
|
||||||
|
│ archivingWorker() │
|
||||||
|
│ (goroutine) │
|
||||||
|
└──────────┬───────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
1. Fetch job metadata
|
||||||
|
2. Load metric data
|
||||||
|
3. Calculate statistics
|
||||||
|
4. Archive to backend
|
||||||
|
5. Update database
|
||||||
|
6. Call hooks
|
||||||
|
```
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
- **archiveChannel**: Buffered channel (128 jobs) for async communication
|
||||||
|
- **archivePending**: WaitGroup tracking in-flight archiving operations
|
||||||
|
- **archivingWorker**: Background goroutine processing archiving requests
|
||||||
|
- **shutdownCtx**: Context for graceful cancellation during shutdown
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Initialization
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Start archiver with context for shutdown control
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
archiver.Start(jobRepository, ctx)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Archiving a Job
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Called automatically when a job completes
|
||||||
|
archiver.TriggerArchiving(job)
|
||||||
|
```
|
||||||
|
|
||||||
|
The function returns immediately. Actual archiving happens in the background.
|
||||||
|
|
||||||
|
### Graceful Shutdown
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Shutdown with 10 second timeout
|
||||||
|
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||||
|
log.Printf("Archiver shutdown timeout: %v", err)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Shutdown process:**
|
||||||
|
1. Closes channel (rejects new jobs)
|
||||||
|
2. Waits for pending jobs (up to timeout)
|
||||||
|
3. Cancels context if timeout exceeded
|
||||||
|
4. Waits for worker to exit cleanly
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Channel Buffer Size
|
||||||
|
|
||||||
|
The archiving channel has a buffer of 128 jobs. If more than 128 jobs are queued simultaneously, `TriggerArchiving()` will block until space is available.
|
||||||
|
|
||||||
|
To adjust:
|
||||||
|
```go
|
||||||
|
// In archiveWorker.go Start() function
|
||||||
|
archiveChannel = make(chan *schema.Job, 256) // Increase buffer
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scope Selection
|
||||||
|
|
||||||
|
Archive data scopes are automatically selected based on job size:
|
||||||
|
|
||||||
|
- **Node scope**: Always included
|
||||||
|
- **Core scope**: Included for jobs with ≤8 nodes (reduces data volume for large jobs)
|
||||||
|
- **Accelerator scope**: Included if job used accelerators (`NumAcc > 0`)
|
||||||
|
|
||||||
|
To adjust the node threshold:
|
||||||
|
```go
|
||||||
|
// In archiver.go ArchiveJob() function
|
||||||
|
if job.NumNodes <= 16 { // Change from 8 to 16
|
||||||
|
scopes = append(scopes, schema.MetricScopeCore)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Resolution
|
||||||
|
|
||||||
|
Data is archived at the highest available resolution (typically 60s intervals). To change:
|
||||||
|
|
||||||
|
```go
|
||||||
|
// In archiver.go ArchiveJob() function
|
||||||
|
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
|
||||||
|
// 0 = highest resolution
|
||||||
|
// 300 = 5-minute resolution
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
### Automatic Retry
|
||||||
|
|
||||||
|
The archiver does **not** automatically retry failed archiving operations. If archiving fails:
|
||||||
|
|
||||||
|
1. Error is logged
|
||||||
|
2. Job is marked as `MonitoringStatusArchivingFailed` in database
|
||||||
|
3. Worker continues processing other jobs
|
||||||
|
|
||||||
|
### Manual Retry
|
||||||
|
|
||||||
|
To re-archive failed jobs, query for jobs with `MonitoringStatusArchivingFailed` and call `TriggerArchiving()` again.
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Single Worker Thread
|
||||||
|
|
||||||
|
The archiver uses a single worker goroutine. For high-throughput systems:
|
||||||
|
|
||||||
|
- Large channel buffer (128) prevents blocking
|
||||||
|
- Archiving is typically I/O bound (writing to storage)
|
||||||
|
- Single worker prevents overwhelming storage backend
|
||||||
|
|
||||||
|
### Shutdown Timeout
|
||||||
|
|
||||||
|
Recommended timeout values:
|
||||||
|
- **Development**: 5-10 seconds
|
||||||
|
- **Production**: 10-30 seconds
|
||||||
|
- **High-load**: 30-60 seconds
|
||||||
|
|
||||||
|
Choose based on:
|
||||||
|
- Average archiving time per job
|
||||||
|
- Storage backend latency
|
||||||
|
- Acceptable shutdown delay
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
### Logging
|
||||||
|
|
||||||
|
The archiver logs:
|
||||||
|
- **Info**: Startup, shutdown, successful completions
|
||||||
|
- **Debug**: Individual job archiving times
|
||||||
|
- **Error**: Archiving failures with job ID and reason
|
||||||
|
- **Warn**: Shutdown timeout exceeded
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
|
||||||
|
Monitor these signals for archiver health:
|
||||||
|
- Jobs with `MonitoringStatusArchivingFailed`
|
||||||
|
- Time from job stop to successful archive
|
||||||
|
- Shutdown timeout occurrences
|
||||||
|
|
||||||
|
## Thread Safety
|
||||||
|
|
||||||
|
All exported functions are safe for concurrent use:
|
||||||
|
- `Start()` - Safe to call once
|
||||||
|
- `TriggerArchiving()` - Safe from multiple goroutines
|
||||||
|
- `Shutdown()` - Safe to call once
|
||||||
|
|
||||||
|
Internal state is protected by:
|
||||||
|
- Channel synchronization (`archiveChannel`)
|
||||||
|
- WaitGroup for pending count (`archivePending`)
|
||||||
|
- Context for cancellation (`shutdownCtx`)
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- **archiveWorker.go**: Worker lifecycle, channel management, shutdown logic
|
||||||
|
- **archiver.go**: Core archiving logic, metric loading, statistics calculation
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- `internal/repository`: Database operations for job metadata
|
||||||
|
- `internal/metricdispatch`: Loading metric data from various backends
|
||||||
|
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
|
||||||
|
- `cc-lib/schema`: Job and metric data structures
|
||||||
250
internal/archiver/archiveWorker.go
Normal file
250
internal/archiver/archiveWorker.go
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package archiver provides asynchronous job archiving functionality for ClusterCockpit.
|
||||||
|
//
|
||||||
|
// The archiver runs a background worker goroutine that processes job archiving requests
|
||||||
|
// from a buffered channel. When jobs complete, their metric data is archived from the
|
||||||
|
// metric store to the configured archive backend (filesystem, S3, etc.).
|
||||||
|
//
|
||||||
|
// # Architecture
|
||||||
|
//
|
||||||
|
// The archiver uses a producer-consumer pattern:
|
||||||
|
// - Producer: TriggerArchiving() sends jobs to archiveChannel
|
||||||
|
// - Consumer: archivingWorker() processes jobs from the channel
|
||||||
|
// - Coordination: sync.WaitGroup tracks pending archive operations
|
||||||
|
//
|
||||||
|
// # Lifecycle
|
||||||
|
//
|
||||||
|
// 1. Start(repo, ctx) - Initialize worker with context for cancellation
|
||||||
|
// 2. TriggerArchiving(job) - Queue job for archiving (called when job stops)
|
||||||
|
// 3. archivingWorker() - Background goroutine processes jobs
|
||||||
|
// 4. Shutdown(timeout) - Graceful shutdown with timeout
|
||||||
|
//
|
||||||
|
// # Graceful Shutdown
|
||||||
|
//
|
||||||
|
// The archiver supports graceful shutdown with configurable timeout:
|
||||||
|
// - Closes channel to reject new jobs
|
||||||
|
// - Waits for pending jobs to complete (up to timeout)
|
||||||
|
// - Cancels context if timeout exceeded
|
||||||
|
// - Ensures worker goroutine exits cleanly
|
||||||
|
//
|
||||||
|
// # Example Usage
|
||||||
|
//
|
||||||
|
// // Initialize archiver
|
||||||
|
// ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
// defer cancel()
|
||||||
|
// archiver.Start(jobRepository, ctx)
|
||||||
|
//
|
||||||
|
// // Trigger archiving when job completes
|
||||||
|
// archiver.TriggerArchiving(job)
|
||||||
|
//
|
||||||
|
// // Graceful shutdown with 10 second timeout
|
||||||
|
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||||
|
// log.Printf("Archiver shutdown timeout: %v", err)
|
||||||
|
// }
|
||||||
|
package archiver
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
sq "github.com/Masterminds/squirrel"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// archivePending counts in-flight archiving operations; Add(1) happens in
	// TriggerArchiving, Done() when the worker finishes (or fails) a job.
	archivePending sync.WaitGroup
	// archiveChannel buffers jobs queued for archiving (capacity set in Start).
	archiveChannel chan *schema.Job
	// jobRepo is used for job metadata lookups and monitoring-status updates.
	jobRepo *repository.JobRepository
	// shutdownCtx signals the worker (and in-flight ArchiveJob calls) to stop.
	shutdownCtx context.Context
	// shutdownCancel cancels shutdownCtx when the shutdown timeout is exceeded.
	shutdownCancel context.CancelFunc
	// workerDone is closed when the worker goroutine exits.
	workerDone chan struct{}
)
|
||||||
|
|
||||||
|
// Start initializes the archiver and starts the background worker goroutine.
|
||||||
|
//
|
||||||
|
// The archiver processes job archiving requests asynchronously via a buffered channel.
|
||||||
|
// Jobs are sent to the channel using TriggerArchiving() and processed by the worker.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - r: JobRepository instance for database operations
|
||||||
|
// - ctx: Context for cancellation (shutdown signal propagation)
|
||||||
|
//
|
||||||
|
// The worker goroutine will run until:
|
||||||
|
// - ctx is cancelled (via parent shutdown)
|
||||||
|
// - archiveChannel is closed (via Shutdown())
|
||||||
|
//
|
||||||
|
// Must be called before TriggerArchiving(). Safe to call only once.
|
||||||
|
func Start(r *repository.JobRepository, ctx context.Context) {
|
||||||
|
shutdownCtx, shutdownCancel = context.WithCancel(ctx)
|
||||||
|
archiveChannel = make(chan *schema.Job, 128)
|
||||||
|
workerDone = make(chan struct{})
|
||||||
|
jobRepo = r
|
||||||
|
|
||||||
|
go archivingWorker()
|
||||||
|
}
|
||||||
|
|
||||||
|
// archivingWorker is the background goroutine that processes job archiving requests.
|
||||||
|
//
|
||||||
|
// The worker loop:
|
||||||
|
// 1. Blocks waiting for jobs on archiveChannel or shutdown signal
|
||||||
|
// 2. Fetches job metadata from repository
|
||||||
|
// 3. Archives job data to configured backend (calls ArchiveJob)
|
||||||
|
// 4. Updates job footprint and energy metrics in database
|
||||||
|
// 5. Marks job as successfully archived
|
||||||
|
// 6. Calls job stop hooks
|
||||||
|
//
|
||||||
|
// The worker exits when:
|
||||||
|
// - shutdownCtx is cancelled (timeout during shutdown)
|
||||||
|
// - archiveChannel is closed (normal shutdown)
|
||||||
|
//
|
||||||
|
// Errors during archiving are logged and the job is marked as failed,
|
||||||
|
// but the worker continues processing other jobs.
|
||||||
|
func archivingWorker() {
|
||||||
|
defer close(workerDone)
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-shutdownCtx.Done():
|
||||||
|
cclog.Info("Archive worker received shutdown signal")
|
||||||
|
return
|
||||||
|
|
||||||
|
case job, ok := <-archiveChannel:
|
||||||
|
if !ok {
|
||||||
|
cclog.Info("Archive channel closed, worker exiting")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
// not using meta data, called to load JobMeta into Cache?
|
||||||
|
// will fail if job meta not in repository
|
||||||
|
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
||||||
|
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
|
||||||
|
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||||
|
archivePending.Done()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||||
|
// Use shutdown context to allow cancellation
|
||||||
|
jobMeta, err := ArchiveJob(job, shutdownCtx)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
|
||||||
|
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||||
|
archivePending.Done()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
||||||
|
|
||||||
|
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
||||||
|
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
|
||||||
|
archivePending.Done()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
||||||
|
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
|
||||||
|
archivePending.Done()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Update the jobs database entry one last time:
|
||||||
|
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
||||||
|
if err := jobRepo.Execute(stmt); err != nil {
|
||||||
|
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
|
||||||
|
archivePending.Done()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||||
|
cclog.Infof("archiving job (dbid: %d) successful", *job.ID)
|
||||||
|
|
||||||
|
repository.CallJobStopHooks(job)
|
||||||
|
archivePending.Done()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TriggerArchiving queues a job for asynchronous archiving.
|
||||||
|
//
|
||||||
|
// This function should be called when a job completes (stops) to archive its
|
||||||
|
// metric data from the metric store to the configured archive backend.
|
||||||
|
//
|
||||||
|
// The function:
|
||||||
|
// 1. Increments the pending job counter (WaitGroup)
|
||||||
|
// 2. Sends the job to the archiving channel (buffered, capacity 128)
|
||||||
|
// 3. Returns immediately (non-blocking unless channel is full)
|
||||||
|
//
|
||||||
|
// The actual archiving is performed asynchronously by the worker goroutine.
|
||||||
|
// Upon completion, the worker will decrement the pending counter.
|
||||||
|
//
|
||||||
|
// Panics if Start() has not been called first.
|
||||||
|
func TriggerArchiving(job *schema.Job) {
|
||||||
|
if archiveChannel == nil {
|
||||||
|
cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
|
||||||
|
}
|
||||||
|
|
||||||
|
archivePending.Add(1)
|
||||||
|
archiveChannel <- job
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shutdown performs a graceful shutdown of the archiver with a configurable timeout.
|
||||||
|
//
|
||||||
|
// The shutdown process:
|
||||||
|
// 1. Closes archiveChannel - no new jobs will be accepted
|
||||||
|
// 2. Waits for pending jobs to complete (up to timeout duration)
|
||||||
|
// 3. If timeout is exceeded:
|
||||||
|
// - Cancels shutdownCtx to interrupt ongoing ArchiveJob operations
|
||||||
|
// - Returns error indicating timeout
|
||||||
|
// 4. Waits for worker goroutine to exit cleanly
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - timeout: Maximum duration to wait for pending jobs to complete
|
||||||
|
// (recommended: 10-30 seconds for production)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - nil if all jobs completed within timeout
|
||||||
|
// - error if timeout was exceeded (some jobs may not have been archived)
|
||||||
|
//
|
||||||
|
// Jobs that don't complete within the timeout will be marked as failed.
|
||||||
|
// The function always ensures the worker goroutine exits before returning.
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||||
|
// log.Printf("Some jobs did not complete: %v", err)
|
||||||
|
// }
|
||||||
|
func Shutdown(timeout time.Duration) error {
|
||||||
|
cclog.Info("Initiating archiver shutdown...")
|
||||||
|
|
||||||
|
// Close channel to signal no more jobs will be accepted
|
||||||
|
close(archiveChannel)
|
||||||
|
|
||||||
|
// Create a channel to signal when all jobs are done
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
archivePending.Wait()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Wait for jobs to complete or timeout
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
cclog.Info("All archive jobs completed successfully")
|
||||||
|
// Wait for worker to exit
|
||||||
|
<-workerDone
|
||||||
|
return nil
|
||||||
|
case <-time.After(timeout):
|
||||||
|
cclog.Warn("Archiver shutdown timeout exceeded, cancelling remaining operations")
|
||||||
|
// Cancel any ongoing operations
|
||||||
|
shutdownCancel()
|
||||||
|
// Wait for worker to exit
|
||||||
|
<-workerDone
|
||||||
|
return fmt.Errorf("archiver shutdown timeout after %v", timeout)
|
||||||
|
}
|
||||||
|
}
|
||||||
97
internal/archiver/archiver.go
Normal file
97
internal/archiver/archiver.go
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package archiver
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"math"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ArchiveJob archives a completed job's metric data to the configured archive backend.
|
||||||
|
//
|
||||||
|
// This function performs the following operations:
|
||||||
|
// 1. Loads all metric data for the job from the metric data repository
|
||||||
|
// 2. Calculates job-level statistics (avg, min, max) for each metric
|
||||||
|
// 3. Stores the job metadata and metric data to the archive backend
|
||||||
|
//
|
||||||
|
// Metric data is retrieved at the highest available resolution (typically 60s)
|
||||||
|
// for the following scopes:
|
||||||
|
// - Node scope (always)
|
||||||
|
// - Core scope (for jobs with ≤8 nodes, to reduce data volume)
|
||||||
|
// - Accelerator scope (if job used accelerators)
|
||||||
|
//
|
||||||
|
// The function respects context cancellation. If ctx is cancelled (e.g., during
|
||||||
|
// shutdown timeout), the operation will be interrupted and return an error.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - job: The job to archive (must be a completed job)
|
||||||
|
// - ctx: Context for cancellation and timeout control
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - *schema.Job with populated Statistics field
|
||||||
|
// - error if data loading or archiving fails
|
||||||
|
//
|
||||||
|
// If config.Keys.DisableArchive is true, only job statistics are calculated
|
||||||
|
// and returned (no data is written to archive backend).
|
||||||
|
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
|
||||||
|
allMetrics := make([]string, 0)
|
||||||
|
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||||
|
for _, mc := range metricConfigs {
|
||||||
|
allMetrics = append(allMetrics, mc.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||||
|
// FIXME: Add a config option for this
|
||||||
|
if job.NumNodes <= 8 {
|
||||||
|
// This will add the native scope if core scope is not available
|
||||||
|
scopes = append(scopes, schema.MetricScopeCore)
|
||||||
|
}
|
||||||
|
|
||||||
|
if job.NumAcc > 0 {
|
||||||
|
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||||
|
}
|
||||||
|
|
||||||
|
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Error("Error wile loading job data for archiving")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
job.Statistics = make(map[string]schema.JobStatistics)
|
||||||
|
|
||||||
|
for metric, data := range jobData {
|
||||||
|
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||||
|
nodeData, ok := data["node"]
|
||||||
|
if !ok {
|
||||||
|
// This should never happen ?
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, series := range nodeData.Series {
|
||||||
|
avg += series.Statistics.Avg
|
||||||
|
min = math.Min(min, series.Statistics.Min)
|
||||||
|
max = math.Max(max, series.Statistics.Max)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Round AVG Result to 2 Digits
|
||||||
|
job.Statistics[metric] = schema.JobStatistics{
|
||||||
|
Unit: schema.Unit{
|
||||||
|
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||||
|
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||||
|
},
|
||||||
|
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
|
||||||
|
Min: min,
|
||||||
|
Max: max,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return job, archive.GetHandle().ImportJob(job, &jobData)
|
||||||
|
}
|
||||||
@@ -1,31 +1,121 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package auth implements various authentication methods
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"golang.org/x/time/rate"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||||
"github.com/gorilla/sessions"
|
"github.com/gorilla/sessions"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Authenticator is the interface for all authentication methods.
// Each authenticator determines if it can handle a login request (CanLogin)
// and performs the actual authentication (Login).
type Authenticator interface {
	// CanLogin determines if this authenticator can handle the login request.
	// It returns the user object if available and a boolean indicating if this
	// authenticator should attempt the login. This method should not perform
	// expensive operations or actual authentication.
	CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)

	// Login performs the actual authentication for the user.
	// It returns the authenticated user or an error if authentication fails.
	// The user parameter may be nil if the user doesn't exist in the database yet.
	Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
initOnce sync.Once
|
||||||
|
authInstance *Authentication
|
||||||
|
)
|
||||||
|
|
||||||
|
// rateLimiterEntry tracks a rate limiter and its last use time for cleanup
|
||||||
|
type rateLimiterEntry struct {
|
||||||
|
limiter *rate.Limiter
|
||||||
|
lastUsed time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
var ipUserLimiters sync.Map
|
||||||
|
|
||||||
|
// getIPUserLimiter returns a rate limiter for the given IP and username combination.
|
||||||
|
// Rate limiters are created on demand and track 5 attempts per 15 minutes.
|
||||||
|
func getIPUserLimiter(ip, username string) *rate.Limiter {
|
||||||
|
key := ip + ":" + username
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if entry, ok := ipUserLimiters.Load(key); ok {
|
||||||
|
rle := entry.(*rateLimiterEntry)
|
||||||
|
rle.lastUsed = now
|
||||||
|
return rle.limiter
|
||||||
|
}
|
||||||
|
|
||||||
|
// More aggressive rate limiting: 5 attempts per 15 minutes
|
||||||
|
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
|
||||||
|
ipUserLimiters.Store(key, &rateLimiterEntry{
|
||||||
|
limiter: newLimiter,
|
||||||
|
lastUsed: now,
|
||||||
|
})
|
||||||
|
return newLimiter
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanupOldRateLimiters removes rate limiters that haven't been used recently
|
||||||
|
func cleanupOldRateLimiters(olderThan time.Time) {
|
||||||
|
ipUserLimiters.Range(func(key, value any) bool {
|
||||||
|
entry := value.(*rateLimiterEntry)
|
||||||
|
if entry.lastUsed.Before(olderThan) {
|
||||||
|
ipUserLimiters.Delete(key)
|
||||||
|
cclog.Debugf("Cleaned up rate limiter for %v", key)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// startRateLimiterCleanup starts a background goroutine to clean up old rate limiters
|
||||||
|
func startRateLimiterCleanup() {
|
||||||
|
go func() {
|
||||||
|
ticker := time.NewTicker(1 * time.Hour)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for range ticker.C {
|
||||||
|
// Clean up limiters not used in the last 24 hours
|
||||||
|
cleanupOldRateLimiters(time.Now().Add(-24 * time.Hour))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// AuthConfig contains configuration for all authentication methods
|
||||||
|
type AuthConfig struct {
|
||||||
|
LdapConfig *LdapConfig `json:"ldap"`
|
||||||
|
JwtConfig *JWTAuthConfig `json:"jwts"`
|
||||||
|
OpenIDConfig *OpenIDConfig `json:"oidc"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keys holds the global authentication configuration
|
||||||
|
var Keys AuthConfig
|
||||||
|
|
||||||
|
// Authentication manages all authentication methods and session handling
|
||||||
type Authentication struct {
|
type Authentication struct {
|
||||||
sessionStore *sessions.CookieStore
|
sessionStore *sessions.CookieStore
|
||||||
LdapAuth *LdapAuthenticator
|
LdapAuth *LdapAuthenticator
|
||||||
@@ -41,7 +131,7 @@ func (auth *Authentication) AuthViaSession(
|
|||||||
) (*schema.User, error) {
|
) (*schema.User, error) {
|
||||||
session, err := auth.sessionStore.Get(r, "session")
|
session, err := auth.sessionStore.Get(r, "session")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Error while getting session store")
|
cclog.Error("Error while getting session store")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -49,10 +139,31 @@ func (auth *Authentication) AuthViaSession(
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Check if session keys exist
|
// Validate session data with proper type checking
|
||||||
username, _ := session.Values["username"].(string)
|
username, ok := session.Values["username"].(string)
|
||||||
projects, _ := session.Values["projects"].([]string)
|
if !ok || username == "" {
|
||||||
roles, _ := session.Values["roles"].([]string)
|
cclog.Warn("Invalid session: missing or invalid username")
|
||||||
|
// Invalidate the corrupted session
|
||||||
|
session.Options.MaxAge = -1
|
||||||
|
_ = auth.sessionStore.Save(r, rw, session)
|
||||||
|
return nil, errors.New("invalid session data")
|
||||||
|
}
|
||||||
|
|
||||||
|
projects, ok := session.Values["projects"].([]string)
|
||||||
|
if !ok {
|
||||||
|
cclog.Warn("Invalid session: projects not found or invalid type, using empty list")
|
||||||
|
projects = []string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
roles, ok := session.Values["roles"].([]string)
|
||||||
|
if !ok || len(roles) == 0 {
|
||||||
|
cclog.Warn("Invalid session: missing or invalid roles")
|
||||||
|
// Invalidate the corrupted session
|
||||||
|
session.Options.MaxAge = -1
|
||||||
|
_ = auth.sessionStore.Save(r, rw, session)
|
||||||
|
return nil, errors.New("invalid session data")
|
||||||
|
}
|
||||||
|
|
||||||
return &schema.User{
|
return &schema.User{
|
||||||
Username: username,
|
Username: username,
|
||||||
Projects: projects,
|
Projects: projects,
|
||||||
@@ -62,90 +173,136 @@ func (auth *Authentication) AuthViaSession(
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func Init() (*Authentication, error) {
|
func Init(authCfg *json.RawMessage) {
|
||||||
auth := &Authentication{}
|
initOnce.Do(func() {
|
||||||
|
authInstance = &Authentication{}
|
||||||
|
|
||||||
sessKey := os.Getenv("SESSION_KEY")
|
// Start background cleanup of rate limiters
|
||||||
if sessKey == "" {
|
startRateLimiterCleanup()
|
||||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
|
||||||
bytes := make([]byte, 32)
|
|
||||||
if _, err := rand.Read(bytes); err != nil {
|
|
||||||
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
|
||||||
} else {
|
|
||||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
|
||||||
if err != nil {
|
|
||||||
log.Error("Error while initializing authentication -> decoding session key failed")
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
if config.Keys.LdapConfig != nil {
|
sessKey := os.Getenv("SESSION_KEY")
|
||||||
ldapAuth := &LdapAuthenticator{}
|
if sessKey == "" {
|
||||||
if err := ldapAuth.Init(); err != nil {
|
cclog.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||||
log.Warn("Error while initializing authentication -> ldapAuth init failed")
|
bytes := make([]byte, 32)
|
||||||
|
if _, err := rand.Read(bytes); err != nil {
|
||||||
|
cclog.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||||
|
}
|
||||||
|
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||||
} else {
|
} else {
|
||||||
auth.LdapAuth = ldapAuth
|
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||||
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
|
if err != nil {
|
||||||
}
|
cclog.Fatal("Error while initializing authentication -> decoding session key failed")
|
||||||
} else {
|
}
|
||||||
log.Info("Missing LDAP configuration: No LDAP support!")
|
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||||
}
|
|
||||||
|
|
||||||
if config.Keys.JwtConfig != nil {
|
|
||||||
auth.JwtAuth = &JWTAuthenticator{}
|
|
||||||
if err := auth.JwtAuth.Init(); err != nil {
|
|
||||||
log.Error("Error while initializing authentication -> jwtAuth init failed")
|
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err == nil {
|
||||||
if err := jwtSessionAuth.Init(); err != nil {
|
authInstance.SessionMaxAge = d
|
||||||
log.Info("jwtSessionAuth init failed: No JWT login support!")
|
}
|
||||||
|
|
||||||
|
if authCfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
config.Validate(configSchema, *authCfg)
|
||||||
|
dec := json.NewDecoder(bytes.NewReader(*authCfg))
|
||||||
|
dec.DisallowUnknownFields()
|
||||||
|
if err := dec.Decode(&Keys); err != nil {
|
||||||
|
cclog.Errorf("error while decoding ldap config: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if Keys.LdapConfig != nil {
|
||||||
|
ldapAuth := &LdapAuthenticator{}
|
||||||
|
if err := ldapAuth.Init(); err != nil {
|
||||||
|
cclog.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||||
|
} else {
|
||||||
|
authInstance.LdapAuth = ldapAuth
|
||||||
|
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
|
cclog.Info("Missing LDAP configuration: No LDAP support!")
|
||||||
}
|
}
|
||||||
|
|
||||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
if Keys.JwtConfig != nil {
|
||||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
authInstance.JwtAuth = &JWTAuthenticator{}
|
||||||
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
if err := authInstance.JwtAuth.Init(); err != nil {
|
||||||
|
cclog.Fatal("Error while initializing authentication -> jwtAuth init failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||||
|
if err := jwtSessionAuth.Init(); err != nil {
|
||||||
|
cclog.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||||
|
} else {
|
||||||
|
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
|
||||||
|
}
|
||||||
|
|
||||||
|
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||||
|
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||||
|
cclog.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||||
|
} else {
|
||||||
|
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
|
cclog.Info("Missing JWT configuration: No JWT token support!")
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
log.Info("Missing JWT configuration: No JWT token support!")
|
|
||||||
}
|
|
||||||
|
|
||||||
auth.LocalAuth = &LocalAuthenticator{}
|
authInstance.LocalAuth = &LocalAuthenticator{}
|
||||||
if err := auth.LocalAuth.Init(); err != nil {
|
if err := authInstance.LocalAuth.Init(); err != nil {
|
||||||
log.Error("Error while initializing authentication -> localAuth init failed")
|
cclog.Fatal("Error while initializing authentication -> localAuth init failed")
|
||||||
return nil, err
|
}
|
||||||
}
|
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
|
||||||
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
|
})
|
||||||
|
|
||||||
return auth, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func persistUser(user *schema.User) {
|
func GetAuthInstance() *Authentication {
|
||||||
|
if authInstance == nil {
|
||||||
|
cclog.Fatal("Authentication module not initialized!")
|
||||||
|
}
|
||||||
|
|
||||||
|
return authInstance
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleUserSync syncs or updates a user in the database based on configuration.
|
||||||
|
// This is used for LDAP, JWT and OIDC authentications when syncUserOnLogin or updateUserOnLogin is enabled.
|
||||||
|
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
|
||||||
r := repository.GetUserRepository()
|
r := repository.GetUserRepository()
|
||||||
_, err := r.GetUser(user.Username)
|
dbUser, err := r.GetUser(user.Username)
|
||||||
|
|
||||||
if err != nil && err != sql.ErrNoRows {
|
if err != nil && err != sql.ErrNoRows {
|
||||||
log.Errorf("Error while loading user '%s': %v", user.Username, err)
|
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||||
} else if err == sql.ErrNoRows {
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
|
||||||
if err := r.AddUser(user); err != nil {
|
if err := r.AddUser(user); err != nil {
|
||||||
log.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||||
|
}
|
||||||
|
} else if err == nil && updateUserOnLogin { // Update existing user
|
||||||
|
if err := r.UpdateUser(dbUser, user); err != nil {
|
||||||
|
cclog.Errorf("Error while updating user '%s' in DB: %v", dbUser.Username, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handleTokenUser syncs JWT token user with database
|
||||||
|
func handleTokenUser(tokenUser *schema.User) {
|
||||||
|
handleUserSync(tokenUser, Keys.JwtConfig.SyncUserOnLogin, Keys.JwtConfig.UpdateUserOnLogin)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleOIDCUser syncs OIDC user with database
|
||||||
|
func handleOIDCUser(OIDCUser *schema.User) {
|
||||||
|
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleLdapUser syncs LDAP user with database
|
||||||
|
func handleLdapUser(ldapUser *schema.User) {
|
||||||
|
handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
|
||||||
|
}
|
||||||
|
|
||||||
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
|
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
|
||||||
session, err := auth.sessionStore.New(r, "session")
|
session, err := auth.sessionStore.New(r, "session")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("session creation failed: %s", err.Error())
|
cclog.Errorf("session creation failed: %s", err.Error())
|
||||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -153,11 +310,21 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
|||||||
if auth.SessionMaxAge != 0 {
|
if auth.SessionMaxAge != 0 {
|
||||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||||
}
|
}
|
||||||
|
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
|
||||||
|
// If neither TLS or an encrypted reverse proxy are used, do not mark cookies as secure.
|
||||||
|
cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
|
||||||
|
if r.Header.Get("X-Forwarded-Proto") == "" {
|
||||||
|
// This warning will not be printed if e.g. X-Forwarded-Proto == http
|
||||||
|
cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
|
||||||
|
}
|
||||||
|
session.Options.Secure = false
|
||||||
|
}
|
||||||
|
session.Options.SameSite = http.SameSiteStrictMode
|
||||||
session.Values["username"] = user.Username
|
session.Values["username"] = user.Username
|
||||||
session.Values["projects"] = user.Projects
|
session.Values["projects"] = user.Projects
|
||||||
session.Values["roles"] = user.Roles
|
session.Values["roles"] = user.Roles
|
||||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||||
log.Warnf("session save failed: %s", err.Error())
|
cclog.Warnf("session save failed: %s", err.Error())
|
||||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -166,18 +333,29 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (auth *Authentication) Login(
|
func (auth *Authentication) Login(
|
||||||
onsuccess http.Handler,
|
|
||||||
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
|
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
|
||||||
) http.Handler {
|
) http.Handler {
|
||||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
username := r.FormValue("username")
|
ip, _, err := net.SplitHostPort(r.RemoteAddr)
|
||||||
var dbUser *schema.User
|
if err != nil {
|
||||||
|
ip = r.RemoteAddr
|
||||||
|
}
|
||||||
|
|
||||||
|
username := r.FormValue("username")
|
||||||
|
|
||||||
|
limiter := getIPUserLimiter(ip, username)
|
||||||
|
if !limiter.Allow() {
|
||||||
|
cclog.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
|
||||||
|
onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var dbUser *schema.User
|
||||||
if username != "" {
|
if username != "" {
|
||||||
var err error
|
var err error
|
||||||
dbUser, err = repository.GetUserRepository().GetUser(username)
|
dbUser, err = repository.GetUserRepository().GetUser(username)
|
||||||
if err != nil && err != sql.ErrNoRows {
|
if err != nil && err != sql.ErrNoRows {
|
||||||
log.Errorf("Error while loading user '%v'", username)
|
cclog.Errorf("Error while loading user '%v'", username)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,12 +365,12 @@ func (auth *Authentication) Login(
|
|||||||
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
|
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
log.Debugf("Can login with user %v", user)
|
cclog.Debugf("Can login with user %v", user)
|
||||||
}
|
}
|
||||||
|
|
||||||
user, err := authenticator.Login(user, rw, r)
|
user, err := authenticator.Login(user, rw, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warnf("user login failed: %s", err.Error())
|
cclog.Warnf("user login failed: %s", err.Error())
|
||||||
onfailure(rw, r, err)
|
onfailure(rw, r, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -201,13 +379,19 @@ func (auth *Authentication) Login(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
|
||||||
|
if r.FormValue("redirect") != "" {
|
||||||
|
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debugf("login failed: no authenticator applied")
|
cclog.Debugf("login failed: no authenticator applied")
|
||||||
onfailure(rw, r, errors.New("no authenticator applied"))
|
onfailure(rw, r, errors.New("no authenticator applied"))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -219,31 +403,186 @@ func (auth *Authentication) Auth(
|
|||||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Infof("authentication failed: %s", err.Error())
|
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if user == nil {
|
if user == nil {
|
||||||
user, err = auth.AuthViaSession(rw, r)
|
user, err = auth.AuthViaSession(rw, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Infof("authentication failed: %s", err.Error())
|
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if user != nil {
|
if user != nil {
|
||||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("authentication failed")
|
cclog.Info("auth -> authentication failed")
|
||||||
onfailure(rw, r, errors.New("unauthorized (please login first)"))
|
onfailure(rw, r, errors.New("unauthorized (please login first)"))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (auth *Authentication) AuthAPI(
|
||||||
|
onsuccess http.Handler,
|
||||||
|
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||||
|
) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Infof("auth api -> authentication failed: %s", err.Error())
|
||||||
|
onfailure(rw, r, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ipErr := securedCheck(user, r)
|
||||||
|
if ipErr != nil {
|
||||||
|
cclog.Infof("auth api -> secured check failed: %s", ipErr.Error())
|
||||||
|
onfailure(rw, r, ipErr)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if user != nil {
|
||||||
|
switch {
|
||||||
|
case len(user.Roles) == 1:
|
||||||
|
if user.HasRole(schema.RoleAPI) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case len(user.Roles) >= 2:
|
||||||
|
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
cclog.Info("auth api -> authentication failed: missing role")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.Info("auth api -> authentication failed: no auth")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (auth *Authentication) AuthUserAPI(
|
||||||
|
onsuccess http.Handler,
|
||||||
|
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||||
|
) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Infof("auth user api -> authentication failed: %s", err.Error())
|
||||||
|
onfailure(rw, r, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if user != nil {
|
||||||
|
switch {
|
||||||
|
case len(user.Roles) == 1:
|
||||||
|
if user.HasRole(schema.RoleAPI) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case len(user.Roles) >= 2:
|
||||||
|
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
cclog.Info("auth user api -> authentication failed: missing role")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.Info("auth user api -> authentication failed: no auth")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (auth *Authentication) AuthMetricStoreAPI(
|
||||||
|
onsuccess http.Handler,
|
||||||
|
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||||
|
) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error())
|
||||||
|
onfailure(rw, r, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if user != nil {
|
||||||
|
switch {
|
||||||
|
case len(user.Roles) == 1:
|
||||||
|
if user.HasRole(schema.RoleAPI) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case len(user.Roles) >= 2:
|
||||||
|
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
cclog.Info("auth metricstore api -> authentication failed: missing role")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.Info("auth metricstore api -> authentication failed: no auth")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (auth *Authentication) AuthConfigAPI(
|
||||||
|
onsuccess http.Handler,
|
||||||
|
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||||
|
) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user, err := auth.AuthViaSession(rw, r)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Infof("auth config api -> authentication failed: %s", err.Error())
|
||||||
|
onfailure(rw, r, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if user != nil && user.HasRole(schema.RoleAdmin) {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cclog.Info("auth config api -> authentication failed: no auth")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (auth *Authentication) AuthFrontendAPI(
|
||||||
|
onsuccess http.Handler,
|
||||||
|
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||||
|
) http.Handler {
|
||||||
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
|
user, err := auth.AuthViaSession(rw, r)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Infof("auth frontend api -> authentication failed: %s", err.Error())
|
||||||
|
onfailure(rw, r, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if user != nil {
|
||||||
|
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cclog.Info("auth frontend api -> authentication failed: no auth")
|
||||||
|
onfailure(rw, r, errors.New("unauthorized"))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||||
session, err := auth.sessionStore.Get(r, "session")
|
session, err := auth.sessionStore.Get(r, "session")
|
||||||
@@ -263,3 +602,42 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
|||||||
onsuccess.ServeHTTP(rw, r)
|
onsuccess.ServeHTTP(rw, r)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper Moved To MiddleWare Auth Handlers
|
||||||
|
func securedCheck(user *schema.User, r *http.Request) error {
|
||||||
|
if user == nil {
|
||||||
|
return fmt.Errorf("no user for secured check")
|
||||||
|
}
|
||||||
|
|
||||||
|
// extract IP address for checking
|
||||||
|
IPAddress := r.Header.Get("X-Real-Ip")
|
||||||
|
if IPAddress == "" {
|
||||||
|
IPAddress = r.Header.Get("X-Forwarded-For")
|
||||||
|
}
|
||||||
|
if IPAddress == "" {
|
||||||
|
IPAddress = r.RemoteAddr
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle both IPv4 and IPv6 addresses properly
|
||||||
|
// For IPv6, this will strip the port and brackets
|
||||||
|
// For IPv4, this will strip the port
|
||||||
|
if host, _, err := net.SplitHostPort(IPAddress); err == nil {
|
||||||
|
IPAddress = host
|
||||||
|
}
|
||||||
|
// If SplitHostPort fails, IPAddress is already just a host (no port)
|
||||||
|
|
||||||
|
// If nothing declared in config: Continue
|
||||||
|
if len(config.Keys.APIAllowedIPs) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// If wildcard declared in config: Continue
|
||||||
|
if config.Keys.APIAllowedIPs[0] == "*" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// check if IP is allowed
|
||||||
|
if !util.Contains(config.Keys.APIAllowedIPs, IPAddress) {
|
||||||
|
return fmt.Errorf("unknown ip: %v", IPAddress)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|||||||
176
internal/auth/auth_test.go
Normal file
176
internal/auth/auth_test.go
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package auth
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestGetIPUserLimiter tests the rate limiter creation and retrieval
|
||||||
|
func TestGetIPUserLimiter(t *testing.T) {
|
||||||
|
ip := "192.168.1.1"
|
||||||
|
username := "testuser"
|
||||||
|
|
||||||
|
// Get limiter for the first time
|
||||||
|
limiter1 := getIPUserLimiter(ip, username)
|
||||||
|
if limiter1 == nil {
|
||||||
|
t.Fatal("Expected limiter to be created")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the same limiter again
|
||||||
|
limiter2 := getIPUserLimiter(ip, username)
|
||||||
|
if limiter1 != limiter2 {
|
||||||
|
t.Error("Expected to get the same limiter instance")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a different limiter for different user
|
||||||
|
limiter3 := getIPUserLimiter(ip, "otheruser")
|
||||||
|
if limiter1 == limiter3 {
|
||||||
|
t.Error("Expected different limiter for different user")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a different limiter for different IP
|
||||||
|
limiter4 := getIPUserLimiter("192.168.1.2", username)
|
||||||
|
if limiter1 == limiter4 {
|
||||||
|
t.Error("Expected different limiter for different IP")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRateLimiterBehavior tests that rate limiting works correctly
|
||||||
|
func TestRateLimiterBehavior(t *testing.T) {
|
||||||
|
ip := "10.0.0.1"
|
||||||
|
username := "ratelimituser"
|
||||||
|
|
||||||
|
limiter := getIPUserLimiter(ip, username)
|
||||||
|
|
||||||
|
// Should allow first 5 attempts
|
||||||
|
for i := range 5 {
|
||||||
|
if !limiter.Allow() {
|
||||||
|
t.Errorf("Request %d should be allowed within rate limit", i+1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 6th attempt should be blocked
|
||||||
|
if limiter.Allow() {
|
||||||
|
t.Error("Request 6 should be blocked by rate limiter")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCleanupOldRateLimiters tests the cleanup function
|
||||||
|
func TestCleanupOldRateLimiters(t *testing.T) {
|
||||||
|
// Clear all existing limiters first to avoid interference from other tests
|
||||||
|
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
|
||||||
|
|
||||||
|
// Create some new rate limiters
|
||||||
|
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||||
|
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
|
||||||
|
|
||||||
|
if limiter1 == nil || limiter2 == nil {
|
||||||
|
t.Fatal("Failed to create test limiters")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup limiters older than 1 second from now (should keep both)
|
||||||
|
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
|
||||||
|
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
|
||||||
|
|
||||||
|
// Verify they still exist (should get same instance)
|
||||||
|
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
|
||||||
|
t.Error("Limiter 1 was incorrectly cleaned up")
|
||||||
|
}
|
||||||
|
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
|
||||||
|
t.Error("Limiter 2 was incorrectly cleaned up")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup limiters older than 1 hour from now (should remove both)
|
||||||
|
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
|
||||||
|
|
||||||
|
// Getting them again should create new instances
|
||||||
|
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||||
|
if newLimiter1 == limiter1 {
|
||||||
|
t.Error("Old limiter should have been cleaned up")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestIPv4Extraction tests extracting IPv4 addresses
|
||||||
|
func TestIPv4Extraction(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"IPv4 with port", "192.168.1.1:8080", "192.168.1.1"},
|
||||||
|
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
|
||||||
|
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := tt.input
|
||||||
|
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||||
|
result = host
|
||||||
|
}
|
||||||
|
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestIPv6Extraction tests extracting IPv6 addresses
|
||||||
|
func TestIPv6Extraction(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"IPv6 with port", "[2001:db8::1]:8080", "2001:db8::1"},
|
||||||
|
{"IPv6 localhost with port", "[::1]:3000", "::1"},
|
||||||
|
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
|
||||||
|
{"IPv6 localhost", "::1", "::1"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := tt.input
|
||||||
|
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||||
|
result = host
|
||||||
|
}
|
||||||
|
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestIPExtractionEdgeCases tests edge cases for IP extraction
|
||||||
|
func TestIPExtractionEdgeCases(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"Hostname without port", "example.com", "example.com"},
|
||||||
|
{"Empty string", "", ""},
|
||||||
|
{"Just port", ":8080", ""},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := tt.input
|
||||||
|
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||||
|
result = host
|
||||||
|
}
|
||||||
|
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,8 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -13,13 +14,33 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
|
||||||
"github.com/golang-jwt/jwt/v5"
|
"github.com/golang-jwt/jwt/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type JWTAuthConfig struct {
|
||||||
|
// Specifies for how long a JWT token shall be valid
|
||||||
|
// as a string parsable by time.ParseDuration().
|
||||||
|
MaxAge string `json:"max-age"`
|
||||||
|
|
||||||
|
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
|
||||||
|
CookieName string `json:"cookie-name"`
|
||||||
|
|
||||||
|
// Deny login for users not in database (but defined in JWT).
|
||||||
|
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
|
||||||
|
ValidateUser bool `json:"validate-user"`
|
||||||
|
|
||||||
|
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
|
||||||
|
TrustedIssuer string `json:"trusted-issuer"`
|
||||||
|
|
||||||
|
// Should an non-existent user be added to the DB based on the information in the token
|
||||||
|
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||||
|
|
||||||
|
// Should an existent user be updated in the DB based on the information in the token
|
||||||
|
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||||
|
}
|
||||||
|
|
||||||
type JWTAuthenticator struct {
|
type JWTAuthenticator struct {
|
||||||
publicKey ed25519.PublicKey
|
publicKey ed25519.PublicKey
|
||||||
privateKey ed25519.PrivateKey
|
privateKey ed25519.PrivateKey
|
||||||
@@ -28,17 +49,17 @@ type JWTAuthenticator struct {
|
|||||||
func (ja *JWTAuthenticator) Init() error {
|
func (ja *JWTAuthenticator) Init() error {
|
||||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||||
if pubKey == "" || privKey == "" {
|
if pubKey == "" || privKey == "" {
|
||||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||||
} else {
|
} else {
|
||||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode JWT public key")
|
cclog.Warn("Could not decode JWT public key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.publicKey = ed25519.PublicKey(bytes)
|
ja.publicKey = ed25519.PublicKey(bytes)
|
||||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode JWT private key")
|
cclog.Warn("Could not decode JWT private key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||||
@@ -62,7 +83,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||||
if t.Method != jwt.SigningMethodEdDSA {
|
if t.Method != jwt.SigningMethodEdDSA {
|
||||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||||
}
|
}
|
||||||
@@ -70,51 +91,34 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
|||||||
return ja.publicKey, nil
|
return ja.publicKey, nil
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while parsing JWT token")
|
cclog.Warn("Error while parsing JWT token")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if !token.Valid {
|
if !token.Valid {
|
||||||
log.Warn("jwt token claims are not valid")
|
cclog.Warn("jwt token claims are not valid")
|
||||||
return nil, errors.New("jwt token claims are not valid")
|
return nil, errors.New("jwt token claims are not valid")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Token is valid, extract payload
|
// Token is valid, extract payload
|
||||||
claims := token.Claims.(jwt.MapClaims)
|
claims := token.Claims.(jwt.MapClaims)
|
||||||
sub, _ := claims["sub"].(string)
|
|
||||||
|
|
||||||
var roles []string
|
// Use shared helper to get user from JWT claims
|
||||||
|
var user *schema.User
|
||||||
// Validate user + roles from JWT against database?
|
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
|
||||||
if config.Keys.JwtConfig.ValidateUser {
|
if err != nil {
|
||||||
ur := repository.GetUserRepository()
|
return nil, err
|
||||||
user, err := ur.GetUser(sub)
|
|
||||||
// Deny any logins for unknown usernames
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("Could not find user from JWT in internal database.")
|
|
||||||
return nil, errors.New("unknown user")
|
|
||||||
}
|
|
||||||
// Take user roles from database instead of trusting the JWT
|
|
||||||
roles = user.Roles
|
|
||||||
} else {
|
|
||||||
// Extract roles from JWT (if present)
|
|
||||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
|
||||||
for _, rr := range rawroles {
|
|
||||||
if r, ok := rr.(string); ok {
|
|
||||||
roles = append(roles, r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &schema.User{
|
// If not validating user, we only get roles from JWT (no projects for this auth method)
|
||||||
Username: sub,
|
if !Keys.JwtConfig.ValidateUser {
|
||||||
Roles: roles,
|
user.Roles = extractRolesFromClaims(claims, false)
|
||||||
AuthType: schema.AuthToken,
|
user.Projects = nil // Standard JWT auth doesn't include projects
|
||||||
AuthSource: -1,
|
}
|
||||||
}, nil
|
|
||||||
|
return user, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate a new JWT that can be used for authentication
|
// ProvideJWT generates a new JWT that can be used for authentication
|
||||||
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
||||||
if ja.privateKey == nil {
|
if ja.privateKey == nil {
|
||||||
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
|
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
|
||||||
@@ -126,8 +130,8 @@ func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
|||||||
"roles": user.Roles,
|
"roles": user.Roles,
|
||||||
"iat": now.Unix(),
|
"iat": now.Unix(),
|
||||||
}
|
}
|
||||||
if config.Keys.JwtConfig.MaxAge != "" {
|
if Keys.JwtConfig.MaxAge != "" {
|
||||||
d, err := time.ParseDuration(config.Keys.JwtConfig.MaxAge)
|
d, err := time.ParseDuration(Keys.JwtConfig.MaxAge)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", errors.New("cannot parse max-age config key")
|
return "", errors.New("cannot parse max-age config key")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,22 +1,19 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"crypto/ed25519"
|
"crypto/ed25519"
|
||||||
"database/sql"
|
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
|
||||||
"github.com/golang-jwt/jwt/v5"
|
"github.com/golang-jwt/jwt/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -31,18 +28,18 @@ var _ Authenticator = (*JWTCookieSessionAuthenticator)(nil)
|
|||||||
func (ja *JWTCookieSessionAuthenticator) Init() error {
|
func (ja *JWTCookieSessionAuthenticator) Init() error {
|
||||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||||
if pubKey == "" || privKey == "" {
|
if pubKey == "" || privKey == "" {
|
||||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||||
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||||
} else {
|
} else {
|
||||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode JWT public key")
|
cclog.Warn("Could not decode JWT public key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.publicKey = ed25519.PublicKey(bytes)
|
ja.publicKey = ed25519.PublicKey(bytes)
|
||||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode JWT private key")
|
cclog.Warn("Could not decode JWT private key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||||
@@ -53,36 +50,35 @@ func (ja *JWTCookieSessionAuthenticator) Init() error {
|
|||||||
if keyFound && pubKeyCrossLogin != "" {
|
if keyFound && pubKeyCrossLogin != "" {
|
||||||
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
|
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode cross login JWT public key")
|
cclog.Warn("Could not decode cross login JWT public key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
|
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
|
||||||
} else {
|
} else {
|
||||||
ja.publicKeyCrossLogin = nil
|
ja.publicKeyCrossLogin = nil
|
||||||
log.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
cclog.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||||
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||||
}
|
}
|
||||||
|
|
||||||
jc := config.Keys.JwtConfig
|
|
||||||
// Warn if other necessary settings are not configured
|
// Warn if other necessary settings are not configured
|
||||||
if jc != nil {
|
if Keys.JwtConfig != nil {
|
||||||
if jc.CookieName == "" {
|
if Keys.JwtConfig.CookieName == "" {
|
||||||
log.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
cclog.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
}
|
}
|
||||||
if !jc.ValidateUser {
|
if !Keys.JwtConfig.ValidateUser {
|
||||||
log.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
cclog.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
||||||
}
|
}
|
||||||
if jc.TrustedIssuer == "" {
|
if Keys.JwtConfig.TrustedIssuer == "" {
|
||||||
log.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
cclog.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
cclog.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
|
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("JWT Cookie Session authenticator successfully registered")
|
cclog.Info("JWT Cookie Session authenticator successfully registered")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,7 +88,7 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
|
|||||||
rw http.ResponseWriter,
|
rw http.ResponseWriter,
|
||||||
r *http.Request,
|
r *http.Request,
|
||||||
) (*schema.User, bool) {
|
) (*schema.User, bool) {
|
||||||
jc := config.Keys.JwtConfig
|
jc := Keys.JwtConfig
|
||||||
cookieName := ""
|
cookieName := ""
|
||||||
if jc.CookieName != "" {
|
if jc.CookieName != "" {
|
||||||
cookieName = jc.CookieName
|
cookieName = jc.CookieName
|
||||||
@@ -115,7 +111,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
|||||||
rw http.ResponseWriter,
|
rw http.ResponseWriter,
|
||||||
r *http.Request,
|
r *http.Request,
|
||||||
) (*schema.User, error) {
|
) (*schema.User, error) {
|
||||||
jc := config.Keys.JwtConfig
|
jc := Keys.JwtConfig
|
||||||
jwtCookie, err := r.Cookie(jc.CookieName)
|
jwtCookie, err := r.Cookie(jc.CookieName)
|
||||||
var rawtoken string
|
var rawtoken string
|
||||||
|
|
||||||
@@ -123,7 +119,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
|||||||
rawtoken = jwtCookie.Value
|
rawtoken = jwtCookie.Value
|
||||||
}
|
}
|
||||||
|
|
||||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||||
if t.Method != jwt.SigningMethodEdDSA {
|
if t.Method != jwt.SigningMethodEdDSA {
|
||||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||||
}
|
}
|
||||||
@@ -140,67 +136,26 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
|||||||
return ja.publicKey, nil
|
return ja.publicKey, nil
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("JWT cookie session: error while parsing token")
|
cclog.Warn("JWT cookie session: error while parsing token")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if !token.Valid {
|
if !token.Valid {
|
||||||
log.Warn("jwt token claims are not valid")
|
cclog.Warn("jwt token claims are not valid")
|
||||||
return nil, errors.New("jwt token claims are not valid")
|
return nil, errors.New("jwt token claims are not valid")
|
||||||
}
|
}
|
||||||
|
|
||||||
claims := token.Claims.(jwt.MapClaims)
|
claims := token.Claims.(jwt.MapClaims)
|
||||||
sub, _ := claims["sub"].(string)
|
|
||||||
|
|
||||||
var roles []string
|
// Use shared helper to get user from JWT claims
|
||||||
projects := make([]string, 0)
|
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if jc.ValidateUser {
|
// Sync or update user if configured
|
||||||
var err error
|
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
|
||||||
user, err = repository.GetUserRepository().GetUser(sub)
|
handleTokenUser(user)
|
||||||
if err != nil && err != sql.ErrNoRows {
|
|
||||||
log.Errorf("Error while loading user '%v'", sub)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Deny any logins for unknown usernames
|
|
||||||
if user == nil {
|
|
||||||
log.Warn("Could not find user from JWT in internal database.")
|
|
||||||
return nil, errors.New("unknown user")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
var name string
|
|
||||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
|
||||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
|
||||||
if len(vals) != 0 {
|
|
||||||
name = fmt.Sprintf("%v", vals[0])
|
|
||||||
|
|
||||||
for i := 1; i < len(vals); i++ {
|
|
||||||
name += fmt.Sprintf(" %v", vals[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract roles from JWT (if present)
|
|
||||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
|
||||||
for _, rr := range rawroles {
|
|
||||||
if r, ok := rr.(string); ok {
|
|
||||||
roles = append(roles, r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
user = &schema.User{
|
|
||||||
Username: sub,
|
|
||||||
Name: name,
|
|
||||||
Roles: roles,
|
|
||||||
Projects: projects,
|
|
||||||
AuthType: schema.AuthSession,
|
|
||||||
AuthSource: schema.AuthViaToken,
|
|
||||||
}
|
|
||||||
|
|
||||||
if jc.SyncUserOnLogin {
|
|
||||||
persistUser(user)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// (Ask browser to) Delete JWT cookie
|
// (Ask browser to) Delete JWT cookie
|
||||||
|
|||||||
138
internal/auth/jwtHelpers.go
Normal file
138
internal/auth/jwtHelpers.go
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package auth
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
"github.com/golang-jwt/jwt/v5"
|
||||||
|
)
|
||||||
|
|
||||||
|
// extractStringFromClaims extracts a string value from JWT claims
|
||||||
|
func extractStringFromClaims(claims jwt.MapClaims, key string) string {
|
||||||
|
if val, ok := claims[key].(string); ok {
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractRolesFromClaims extracts roles from JWT claims
|
||||||
|
// If validateRoles is true, only valid roles are returned
|
||||||
|
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||||
|
var roles []string
|
||||||
|
|
||||||
|
if rawroles, ok := claims["roles"].([]any); ok {
|
||||||
|
for _, rr := range rawroles {
|
||||||
|
if r, ok := rr.(string); ok {
|
||||||
|
if validateRoles {
|
||||||
|
if schema.IsValidRole(r) {
|
||||||
|
roles = append(roles, r)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
roles = append(roles, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return roles
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractProjectsFromClaims extracts projects from JWT claims
|
||||||
|
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||||
|
projects := make([]string, 0)
|
||||||
|
|
||||||
|
if rawprojs, ok := claims["projects"].([]any); ok {
|
||||||
|
for _, pp := range rawprojs {
|
||||||
|
if p, ok := pp.(string); ok {
|
||||||
|
projects = append(projects, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if rawprojs, ok := claims["projects"]; ok {
|
||||||
|
if projSlice, ok := rawprojs.([]string); ok {
|
||||||
|
projects = append(projects, projSlice...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return projects
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractNameFromClaims extracts name from JWT claims
|
||||||
|
// Handles both simple string and complex nested structure
|
||||||
|
func extractNameFromClaims(claims jwt.MapClaims) string {
|
||||||
|
// Try simple string first
|
||||||
|
if name, ok := claims["name"].(string); ok {
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try nested structure: {name: {values: [...]}}
|
||||||
|
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||||
|
if vals, ok := wrap["values"].([]any); ok {
|
||||||
|
if len(vals) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
var name strings.Builder
|
||||||
|
name.WriteString(fmt.Sprintf("%v", vals[0]))
|
||||||
|
for i := 1; i < len(vals); i++ {
|
||||||
|
name.WriteString(fmt.Sprintf(" %v", vals[i]))
|
||||||
|
}
|
||||||
|
return name.String()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// getUserFromJWT creates or retrieves a user based on JWT claims
|
||||||
|
// If validateUser is true, the user must exist in the database
|
||||||
|
// Otherwise, a new user object is created from claims
|
||||||
|
// authSource should be a schema.AuthSource constant (like schema.AuthViaToken)
|
||||||
|
func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.AuthType, authSource schema.AuthSource) (*schema.User, error) {
|
||||||
|
sub := extractStringFromClaims(claims, "sub")
|
||||||
|
if sub == "" {
|
||||||
|
return nil, errors.New("missing 'sub' claim in JWT")
|
||||||
|
}
|
||||||
|
|
||||||
|
if validateUser {
|
||||||
|
// Validate user against database
|
||||||
|
ur := repository.GetUserRepository()
|
||||||
|
user, err := ur.GetUser(sub)
|
||||||
|
if err != nil && err != sql.ErrNoRows {
|
||||||
|
cclog.Errorf("Error while loading user '%v': %v", sub, err)
|
||||||
|
return nil, fmt.Errorf("database error: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deny any logins for unknown usernames
|
||||||
|
if user == nil || err == sql.ErrNoRows {
|
||||||
|
cclog.Warn("Could not find user from JWT in internal database.")
|
||||||
|
return nil, errors.New("unknown user")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return database user (with database roles)
|
||||||
|
return user, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create user from JWT claims
|
||||||
|
name := extractNameFromClaims(claims)
|
||||||
|
roles := extractRolesFromClaims(claims, true) // Validate roles
|
||||||
|
projects := extractProjectsFromClaims(claims)
|
||||||
|
|
||||||
|
return &schema.User{
|
||||||
|
Username: sub,
|
||||||
|
Name: name,
|
||||||
|
Roles: roles,
|
||||||
|
Projects: projects,
|
||||||
|
AuthType: authType,
|
||||||
|
AuthSource: authSource,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
280
internal/auth/jwtHelpers_test.go
Normal file
280
internal/auth/jwtHelpers_test.go
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package auth
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
|
"github.com/golang-jwt/jwt/v5"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestExtractStringFromClaims tests extracting string values from JWT claims
|
||||||
|
func TestExtractStringFromClaims(t *testing.T) {
|
||||||
|
claims := jwt.MapClaims{
|
||||||
|
"sub": "testuser",
|
||||||
|
"email": "test@example.com",
|
||||||
|
"age": 25, // not a string
|
||||||
|
}
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
key string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"Existing string", "sub", "testuser"},
|
||||||
|
{"Another string", "email", "test@example.com"},
|
||||||
|
{"Non-existent key", "missing", ""},
|
||||||
|
{"Non-string value", "age", ""},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := extractStringFromClaims(claims, tt.key)
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestExtractRolesFromClaims tests role extraction and validation
|
||||||
|
func TestExtractRolesFromClaims(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
claims jwt.MapClaims
|
||||||
|
validateRoles bool
|
||||||
|
expected []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Valid roles without validation",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"roles": []any{"admin", "user", "invalid_role"},
|
||||||
|
},
|
||||||
|
validateRoles: false,
|
||||||
|
expected: []string{"admin", "user", "invalid_role"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Valid roles with validation",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"roles": []any{"admin", "user", "api"},
|
||||||
|
},
|
||||||
|
validateRoles: true,
|
||||||
|
expected: []string{"admin", "user", "api"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Invalid roles with validation",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"roles": []any{"invalid_role", "fake_role"},
|
||||||
|
},
|
||||||
|
validateRoles: true,
|
||||||
|
expected: []string{}, // Should filter out invalid roles
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "No roles claim",
|
||||||
|
claims: jwt.MapClaims{},
|
||||||
|
validateRoles: false,
|
||||||
|
expected: []string{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Non-array roles",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"roles": "admin",
|
||||||
|
},
|
||||||
|
validateRoles: false,
|
||||||
|
expected: []string{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
|
||||||
|
|
||||||
|
if len(result) != len(tt.expected) {
|
||||||
|
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, role := range result {
|
||||||
|
if i >= len(tt.expected) || role != tt.expected[i] {
|
||||||
|
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestExtractProjectsFromClaims tests project extraction from claims
|
||||||
|
func TestExtractProjectsFromClaims(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
claims jwt.MapClaims
|
||||||
|
expected []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Projects as array of interfaces",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"projects": []any{"project1", "project2", "project3"},
|
||||||
|
},
|
||||||
|
expected: []string{"project1", "project2", "project3"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Projects as string array",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"projects": []string{"projectA", "projectB"},
|
||||||
|
},
|
||||||
|
expected: []string{"projectA", "projectB"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "No projects claim",
|
||||||
|
claims: jwt.MapClaims{},
|
||||||
|
expected: []string{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Mixed types in projects array",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"projects": []any{"project1", 123, "project2"},
|
||||||
|
},
|
||||||
|
expected: []string{"project1", "project2"}, // Should skip non-strings
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := extractProjectsFromClaims(tt.claims)
|
||||||
|
|
||||||
|
if len(result) != len(tt.expected) {
|
||||||
|
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, project := range result {
|
||||||
|
if i >= len(tt.expected) || project != tt.expected[i] {
|
||||||
|
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestExtractNameFromClaims tests name extraction from various formats
|
||||||
|
func TestExtractNameFromClaims(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
claims jwt.MapClaims
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Simple string name",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"name": "John Doe",
|
||||||
|
},
|
||||||
|
expected: "John Doe",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Nested name structure",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"name": map[string]any{
|
||||||
|
"values": []any{"John", "Doe"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: "John Doe",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Nested name with single value",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"name": map[string]any{
|
||||||
|
"values": []any{"Alice"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: "Alice",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "No name claim",
|
||||||
|
claims: jwt.MapClaims{},
|
||||||
|
expected: "",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Empty nested values",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"name": map[string]any{
|
||||||
|
"values": []any{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: "",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Nested with non-string values",
|
||||||
|
claims: jwt.MapClaims{
|
||||||
|
"name": map[string]any{
|
||||||
|
"values": []any{123, "Smith"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: "123 Smith", // Should convert to string
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := extractNameFromClaims(tt.claims)
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("Expected '%s', got '%s'", tt.expected, result)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGetUserFromJWT_NoValidation tests getUserFromJWT without database validation
|
||||||
|
func TestGetUserFromJWT_NoValidation(t *testing.T) {
|
||||||
|
claims := jwt.MapClaims{
|
||||||
|
"sub": "testuser",
|
||||||
|
"name": "Test User",
|
||||||
|
"roles": []any{"user", "admin"},
|
||||||
|
"projects": []any{"project1", "project2"},
|
||||||
|
}
|
||||||
|
|
||||||
|
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if user.Username != "testuser" {
|
||||||
|
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
|
||||||
|
}
|
||||||
|
|
||||||
|
if user.Name != "Test User" {
|
||||||
|
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(user.Roles) != 2 {
|
||||||
|
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(user.Projects) != 2 {
|
||||||
|
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
|
||||||
|
}
|
||||||
|
|
||||||
|
if user.AuthType != schema.AuthToken {
|
||||||
|
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGetUserFromJWT_MissingSub tests error when sub claim is missing
|
||||||
|
func TestGetUserFromJWT_MissingSub(t *testing.T) {
|
||||||
|
claims := jwt.MapClaims{
|
||||||
|
"name": "Test User",
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
t.Error("Expected error for missing sub claim")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err.Error() != "missing 'sub' claim in JWT" {
|
||||||
|
t.Errorf("Expected specific error message, got: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,11 +1,11 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"database/sql"
|
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -13,10 +13,8 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
|
||||||
"github.com/golang-jwt/jwt/v5"
|
"github.com/golang-jwt/jwt/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,13 +28,13 @@ func (ja *JWTSessionAuthenticator) Init() error {
|
|||||||
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
|
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
|
||||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not decode cross login JWT HS512 key")
|
cclog.Warn("Could not decode cross login JWT HS512 key")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ja.loginTokenKey = bytes
|
ja.loginTokenKey = bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("JWT Session authenticator successfully registered")
|
cclog.Info("JWT Session authenticator successfully registered")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,87 +58,33 @@ func (ja *JWTSessionAuthenticator) Login(
|
|||||||
rawtoken = r.URL.Query().Get("login-token")
|
rawtoken = r.URL.Query().Get("login-token")
|
||||||
}
|
}
|
||||||
|
|
||||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||||
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
|
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
|
||||||
return ja.loginTokenKey, nil
|
return ja.loginTokenKey, nil
|
||||||
}
|
}
|
||||||
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
|
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while parsing jwt token")
|
cclog.Warn("Error while parsing jwt token")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if !token.Valid {
|
if !token.Valid {
|
||||||
log.Warn("jwt token claims are not valid")
|
cclog.Warn("jwt token claims are not valid")
|
||||||
return nil, errors.New("jwt token claims are not valid")
|
return nil, errors.New("jwt token claims are not valid")
|
||||||
}
|
}
|
||||||
|
|
||||||
claims := token.Claims.(jwt.MapClaims)
|
claims := token.Claims.(jwt.MapClaims)
|
||||||
sub, _ := claims["sub"].(string)
|
|
||||||
|
|
||||||
var roles []string
|
// Use shared helper to get user from JWT claims
|
||||||
projects := make([]string, 0)
|
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if config.Keys.JwtConfig.ValidateUser {
|
// Sync or update user if configured
|
||||||
var err error
|
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
|
||||||
user, err = repository.GetUserRepository().GetUser(sub)
|
handleTokenUser(user)
|
||||||
if err != nil && err != sql.ErrNoRows {
|
|
||||||
log.Errorf("Error while loading user '%v'", sub)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Deny any logins for unknown usernames
|
|
||||||
if user == nil {
|
|
||||||
log.Warn("Could not find user from JWT in internal database.")
|
|
||||||
return nil, errors.New("unknown user")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
var name string
|
|
||||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
|
||||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
|
||||||
if len(vals) != 0 {
|
|
||||||
name = fmt.Sprintf("%v", vals[0])
|
|
||||||
|
|
||||||
for i := 1; i < len(vals); i++ {
|
|
||||||
name += fmt.Sprintf(" %v", vals[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract roles from JWT (if present)
|
|
||||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
|
||||||
for _, rr := range rawroles {
|
|
||||||
if r, ok := rr.(string); ok {
|
|
||||||
if schema.IsValidRole(r) {
|
|
||||||
roles = append(roles, r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if rawprojs, ok := claims["projects"].([]interface{}); ok {
|
|
||||||
for _, pp := range rawprojs {
|
|
||||||
if p, ok := pp.(string); ok {
|
|
||||||
projects = append(projects, p)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if rawprojs, ok := claims["projects"]; ok {
|
|
||||||
projects = append(projects, rawprojs.([]string)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
user = &schema.User{
|
|
||||||
Username: sub,
|
|
||||||
Name: name,
|
|
||||||
Roles: roles,
|
|
||||||
Projects: projects,
|
|
||||||
AuthType: schema.AuthSession,
|
|
||||||
AuthSource: schema.AuthViaToken,
|
|
||||||
}
|
|
||||||
|
|
||||||
if config.Keys.JwtConfig.SyncUserOnLogin {
|
|
||||||
persistUser(user)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return user, nil
|
return user, nil
|
||||||
|
|||||||
@@ -1,27 +1,44 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"github.com/go-ldap/ldap/v3"
|
"github.com/go-ldap/ldap/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type LdapConfig struct {
|
||||||
|
URL string `json:"url"`
|
||||||
|
UserBase string `json:"user-base"`
|
||||||
|
SearchDN string `json:"search-dn"`
|
||||||
|
UserBind string `json:"user-bind"`
|
||||||
|
UserFilter string `json:"user-filter"`
|
||||||
|
UserAttr string `json:"username-attr"`
|
||||||
|
UIDAttr string `json:"uid-attr"`
|
||||||
|
SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration.
|
||||||
|
SyncDelOldUsers bool `json:"sync-del-old-users"`
|
||||||
|
|
||||||
|
// Should a non-existent user be added to the DB if user exists in ldap directory
|
||||||
|
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||||
|
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||||
|
}
|
||||||
|
|
||||||
type LdapAuthenticator struct {
|
type LdapAuthenticator struct {
|
||||||
syncPassword string
|
syncPassword string
|
||||||
UserAttr string
|
UserAttr string
|
||||||
|
UIDAttr string
|
||||||
}
|
}
|
||||||
|
|
||||||
var _ Authenticator = (*LdapAuthenticator)(nil)
|
var _ Authenticator = (*LdapAuthenticator)(nil)
|
||||||
@@ -29,44 +46,21 @@ var _ Authenticator = (*LdapAuthenticator)(nil)
|
|||||||
func (la *LdapAuthenticator) Init() error {
|
func (la *LdapAuthenticator) Init() error {
|
||||||
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
|
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
|
||||||
if la.syncPassword == "" {
|
if la.syncPassword == "" {
|
||||||
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
cclog.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
||||||
}
|
}
|
||||||
|
|
||||||
lc := config.Keys.LdapConfig
|
if Keys.LdapConfig.UserAttr != "" {
|
||||||
|
la.UserAttr = Keys.LdapConfig.UserAttr
|
||||||
if lc.SyncInterval != "" {
|
|
||||||
interval, err := time.ParseDuration(lc.SyncInterval)
|
|
||||||
if err != nil {
|
|
||||||
log.Warnf("Could not parse duration for sync interval: %v",
|
|
||||||
lc.SyncInterval)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if interval == 0 {
|
|
||||||
log.Info("Sync interval is zero")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
ticker := time.NewTicker(interval)
|
|
||||||
for t := range ticker.C {
|
|
||||||
log.Printf("sync started at %s", t.Format(time.RFC3339))
|
|
||||||
if err := la.Sync(); err != nil {
|
|
||||||
log.Errorf("sync failed: %s", err.Error())
|
|
||||||
}
|
|
||||||
log.Print("sync done")
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
} else {
|
|
||||||
log.Info("LDAP configuration key sync_interval invalid")
|
|
||||||
}
|
|
||||||
|
|
||||||
if lc.UserAttr != "" {
|
|
||||||
la.UserAttr = lc.UserAttr
|
|
||||||
} else {
|
} else {
|
||||||
la.UserAttr = "gecos"
|
la.UserAttr = "gecos"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if Keys.LdapConfig.UIDAttr != "" {
|
||||||
|
la.UIDAttr = Keys.LdapConfig.UIDAttr
|
||||||
|
} else {
|
||||||
|
la.UIDAttr = "uid"
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -76,60 +70,50 @@ func (la *LdapAuthenticator) CanLogin(
|
|||||||
rw http.ResponseWriter,
|
rw http.ResponseWriter,
|
||||||
r *http.Request,
|
r *http.Request,
|
||||||
) (*schema.User, bool) {
|
) (*schema.User, bool) {
|
||||||
lc := config.Keys.LdapConfig
|
lc := Keys.LdapConfig
|
||||||
|
|
||||||
if user != nil {
|
if user != nil {
|
||||||
if user.AuthSource == schema.AuthViaLDAP {
|
if user.AuthSource == schema.AuthViaLDAP {
|
||||||
return user, true
|
return user, true
|
||||||
}
|
}
|
||||||
} else {
|
} else if lc.SyncUserOnLogin {
|
||||||
if lc.SyncUserOnLogin {
|
l, err := la.getLdapConnection(true)
|
||||||
l, err := la.getLdapConnection(true)
|
if err != nil {
|
||||||
if err != nil {
|
cclog.Error("LDAP connection error")
|
||||||
log.Error("LDAP connection error")
|
return nil, false
|
||||||
}
|
|
||||||
defer l.Close()
|
|
||||||
|
|
||||||
// Search for the given username
|
|
||||||
searchRequest := ldap.NewSearchRequest(
|
|
||||||
lc.UserBase,
|
|
||||||
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
|
||||||
fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
|
|
||||||
[]string{"dn", "uid", la.UserAttr}, nil)
|
|
||||||
|
|
||||||
sr, err := l.Search(searchRequest)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn(err)
|
|
||||||
return nil, false
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(sr.Entries) != 1 {
|
|
||||||
log.Warn("LDAP: User does not exist or too many entries returned")
|
|
||||||
return nil, false
|
|
||||||
}
|
|
||||||
|
|
||||||
entry := sr.Entries[0]
|
|
||||||
name := entry.GetAttributeValue(la.UserAttr)
|
|
||||||
var roles []string
|
|
||||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
|
||||||
projects := make([]string, 0)
|
|
||||||
|
|
||||||
user = &schema.User{
|
|
||||||
Username: username,
|
|
||||||
Name: name,
|
|
||||||
Roles: roles,
|
|
||||||
Projects: projects,
|
|
||||||
AuthType: schema.AuthSession,
|
|
||||||
AuthSource: schema.AuthViaLDAP,
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := repository.GetUserRepository().AddUser(user); err != nil {
|
|
||||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
|
||||||
return nil, false
|
|
||||||
}
|
|
||||||
|
|
||||||
return user, true
|
|
||||||
}
|
}
|
||||||
|
defer l.Close()
|
||||||
|
|
||||||
|
// Search for the given username
|
||||||
|
searchRequest := ldap.NewSearchRequest(
|
||||||
|
lc.UserBase,
|
||||||
|
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||||
|
fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
|
||||||
|
[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
|
||||||
|
|
||||||
|
sr, err := l.Search(searchRequest)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Warn(err)
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(sr.Entries) != 1 {
|
||||||
|
cclog.Warn("LDAP: User does not exist or too many entries returned")
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
entry := sr.Entries[0]
|
||||||
|
user = &schema.User{
|
||||||
|
Username: username,
|
||||||
|
Name: entry.GetAttributeValue(la.UserAttr),
|
||||||
|
Roles: []string{schema.GetRoleString(schema.RoleUser)},
|
||||||
|
Projects: make([]string, 0),
|
||||||
|
AuthType: schema.AuthSession,
|
||||||
|
AuthSource: schema.AuthViaLDAP,
|
||||||
|
}
|
||||||
|
|
||||||
|
handleLdapUser(user)
|
||||||
|
return user, true
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, false
|
return nil, false
|
||||||
@@ -142,14 +126,14 @@ func (la *LdapAuthenticator) Login(
|
|||||||
) (*schema.User, error) {
|
) (*schema.User, error) {
|
||||||
l, err := la.getLdapConnection(false)
|
l, err := la.getLdapConnection(false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while getting ldap connection")
|
cclog.Warn("Error while getting ldap connection")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer l.Close()
|
defer l.Close()
|
||||||
|
|
||||||
userDn := strings.Replace(config.Keys.LdapConfig.UserBind, "{username}", user.Username, -1)
|
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
|
||||||
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
|
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
|
||||||
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||||
user.Username, err)
|
user.Username, err)
|
||||||
return nil, fmt.Errorf("Authentication failed")
|
return nil, fmt.Errorf("Authentication failed")
|
||||||
}
|
}
|
||||||
@@ -158,11 +142,11 @@ func (la *LdapAuthenticator) Login(
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (la *LdapAuthenticator) Sync() error {
|
func (la *LdapAuthenticator) Sync() error {
|
||||||
const IN_DB int = 1
|
const InDB int = 1
|
||||||
const IN_LDAP int = 2
|
const InLdap int = 2
|
||||||
const IN_BOTH int = 3
|
const InBoth int = 3
|
||||||
ur := repository.GetUserRepository()
|
ur := repository.GetUserRepository()
|
||||||
lc := config.Keys.LdapConfig
|
lc := Keys.LdapConfig
|
||||||
|
|
||||||
users := map[string]int{}
|
users := map[string]int{}
|
||||||
usernames, err := ur.GetLdapUsernames()
|
usernames, err := ur.GetLdapUsernames()
|
||||||
@@ -171,12 +155,12 @@ func (la *LdapAuthenticator) Sync() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, username := range usernames {
|
for _, username := range usernames {
|
||||||
users[username] = IN_DB
|
users[username] = InDB
|
||||||
}
|
}
|
||||||
|
|
||||||
l, err := la.getLdapConnection(true)
|
l, err := la.getLdapConnection(true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("LDAP connection error")
|
cclog.Error("LDAP connection error")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer l.Close()
|
defer l.Close()
|
||||||
@@ -185,50 +169,49 @@ func (la *LdapAuthenticator) Sync() error {
|
|||||||
lc.UserBase,
|
lc.UserBase,
|
||||||
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||||
lc.UserFilter,
|
lc.UserFilter,
|
||||||
[]string{"dn", "uid", la.UserAttr}, nil))
|
[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("LDAP search error")
|
cclog.Warn("LDAP search error")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
newnames := map[string]string{}
|
newnames := map[string]string{}
|
||||||
for _, entry := range ldapResults.Entries {
|
for _, entry := range ldapResults.Entries {
|
||||||
username := entry.GetAttributeValue("uid")
|
username := entry.GetAttributeValue(la.UIDAttr)
|
||||||
if username == "" {
|
if username == "" {
|
||||||
return errors.New("no attribute 'uid'")
|
return fmt.Errorf("no attribute '%s'", la.UIDAttr)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, ok := users[username]
|
_, ok := users[username]
|
||||||
if !ok {
|
if !ok {
|
||||||
users[username] = IN_LDAP
|
users[username] = InLdap
|
||||||
newnames[username] = entry.GetAttributeValue(la.UserAttr)
|
newnames[username] = entry.GetAttributeValue(la.UserAttr)
|
||||||
} else {
|
} else {
|
||||||
users[username] = IN_BOTH
|
users[username] = InBoth
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for username, where := range users {
|
for username, where := range users {
|
||||||
if where == IN_DB && lc.SyncDelOldUsers {
|
if where == InDB && lc.SyncDelOldUsers {
|
||||||
ur.DelUser(username)
|
if err := ur.DelUser(username); err != nil {
|
||||||
log.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
|
||||||
} else if where == IN_LDAP {
|
return err
|
||||||
|
}
|
||||||
|
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||||
|
} else if where == InLdap {
|
||||||
name := newnames[username]
|
name := newnames[username]
|
||||||
|
|
||||||
var roles []string
|
|
||||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
|
||||||
projects := make([]string, 0)
|
|
||||||
|
|
||||||
user := &schema.User{
|
user := &schema.User{
|
||||||
Username: username,
|
Username: username,
|
||||||
Name: name,
|
Name: name,
|
||||||
Roles: roles,
|
Roles: []string{schema.GetRoleString(schema.RoleUser)},
|
||||||
Projects: projects,
|
Projects: make([]string, 0),
|
||||||
AuthSource: schema.AuthViaLDAP,
|
AuthSource: schema.AuthViaLDAP,
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
cclog.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
||||||
if err := ur.AddUser(user); err != nil {
|
if err := ur.AddUser(user); err != nil {
|
||||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -238,17 +221,19 @@ func (la *LdapAuthenticator) Sync() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
||||||
lc := config.Keys.LdapConfig
|
lc := Keys.LdapConfig
|
||||||
conn, err := ldap.DialURL(lc.Url)
|
conn, err := ldap.DialURL(lc.URL,
|
||||||
|
ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("LDAP URL dial failed")
|
cclog.Warn("LDAP URL dial failed")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
conn.SetTimeout(30 * time.Second)
|
||||||
|
|
||||||
if admin {
|
if admin {
|
||||||
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
||||||
conn.Close()
|
conn.Close()
|
||||||
log.Warn("LDAP connection bind failed")
|
cclog.Warn("LDAP connection bind failed")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,15 +1,16 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"golang.org/x/crypto/bcrypt"
|
"golang.org/x/crypto/bcrypt"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -27,19 +28,19 @@ func (la *LocalAuthenticator) CanLogin(
|
|||||||
user *schema.User,
|
user *schema.User,
|
||||||
username string,
|
username string,
|
||||||
rw http.ResponseWriter,
|
rw http.ResponseWriter,
|
||||||
r *http.Request) (*schema.User, bool) {
|
r *http.Request,
|
||||||
|
) (*schema.User, bool) {
|
||||||
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
|
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
|
||||||
}
|
}
|
||||||
|
|
||||||
func (la *LocalAuthenticator) Login(
|
func (la *LocalAuthenticator) Login(
|
||||||
user *schema.User,
|
user *schema.User,
|
||||||
rw http.ResponseWriter,
|
rw http.ResponseWriter,
|
||||||
r *http.Request) (*schema.User, error) {
|
r *http.Request,
|
||||||
|
) (*schema.User, error) {
|
||||||
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
|
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
|
||||||
[]byte(r.FormValue("password"))); e != nil {
|
[]byte(r.FormValue("password"))); e != nil {
|
||||||
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
cclog.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
||||||
return nil, fmt.Errorf("Authentication failed")
|
return nil, fmt.Errorf("Authentication failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,27 +1,34 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package auth
|
package auth
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
"github.com/coreos/go-oidc/v3/oidc"
|
"github.com/coreos/go-oidc/v3/oidc"
|
||||||
"github.com/gorilla/mux"
|
"github.com/go-chi/chi/v5"
|
||||||
"golang.org/x/oauth2"
|
"golang.org/x/oauth2"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type OpenIDConfig struct {
|
||||||
|
Provider string `json:"provider"`
|
||||||
|
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||||
|
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||||
|
}
|
||||||
|
|
||||||
type OIDC struct {
|
type OIDC struct {
|
||||||
client *oauth2.Config
|
client *oauth2.Config
|
||||||
provider *oidc.Provider
|
provider *oidc.Provider
|
||||||
@@ -44,30 +51,35 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
|
|||||||
MaxAge: int(time.Hour.Seconds()),
|
MaxAge: int(time.Hour.Seconds()),
|
||||||
Secure: r.TLS != nil,
|
Secure: r.TLS != nil,
|
||||||
HttpOnly: true,
|
HttpOnly: true,
|
||||||
|
SameSite: http.SameSiteLaxMode,
|
||||||
}
|
}
|
||||||
http.SetCookie(w, c)
|
http.SetCookie(w, c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewOIDC creates a new OIDC authenticator with the configured provider
|
||||||
func NewOIDC(a *Authentication) *OIDC {
|
func NewOIDC(a *Authentication) *OIDC {
|
||||||
provider, err := oidc.NewProvider(context.Background(), config.Keys.OpenIDConfig.Provider)
|
// Use context with timeout for provider initialization
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
cclog.Fatal(err)
|
||||||
}
|
}
|
||||||
clientID := os.Getenv("OID_CLIENT_ID")
|
clientID := os.Getenv("OID_CLIENT_ID")
|
||||||
if clientID == "" {
|
if clientID == "" {
|
||||||
log.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
cclog.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
||||||
}
|
}
|
||||||
clientSecret := os.Getenv("OID_CLIENT_SECRET")
|
clientSecret := os.Getenv("OID_CLIENT_SECRET")
|
||||||
if clientSecret == "" {
|
if clientSecret == "" {
|
||||||
log.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
cclog.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
||||||
}
|
}
|
||||||
|
|
||||||
client := &oauth2.Config{
|
client := &oauth2.Config{
|
||||||
ClientID: clientID,
|
ClientID: clientID,
|
||||||
ClientSecret: clientSecret,
|
ClientSecret: clientSecret,
|
||||||
Endpoint: provider.Endpoint(),
|
Endpoint: provider.Endpoint(),
|
||||||
RedirectURL: "oidc-callback",
|
Scopes: []string{oidc.ScopeOpenID, "profile"},
|
||||||
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
|
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
|
||||||
@@ -75,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
|||||||
return oa
|
return oa
|
||||||
}
|
}
|
||||||
|
|
||||||
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
|
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
|
||||||
r.HandleFunc("/oidc-login", oa.OAuth2Login)
|
r.HandleFunc("/oidc-login", oa.OAuth2Login)
|
||||||
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
|
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
|
||||||
}
|
}
|
||||||
@@ -105,55 +117,99 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
|||||||
http.Error(rw, "Code not found", http.StatusBadRequest)
|
http.Error(rw, "Code not found", http.StatusBadRequest)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
token, err := oa.client.Exchange(context.Background(), code, oauth2.VerifierOption(codeVerifier))
|
// Exchange authorization code for token with timeout
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
|
cclog.Errorf("token exchange failed: %s", err.Error())
|
||||||
|
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
userInfo, err := oa.provider.UserInfo(context.Background(), oauth2.StaticTokenSource(token))
|
// Get user info from OIDC provider with same timeout
|
||||||
|
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
|
cclog.Errorf("failed to get userinfo: %s", err.Error())
|
||||||
|
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// // Extract the ID Token from OAuth2 token.
|
// Verify ID token and nonce to prevent replay attacks
|
||||||
// rawIDToken, ok := token.Extra("id_token").(string)
|
rawIDToken, ok := token.Extra("id_token").(string)
|
||||||
// if !ok {
|
if !ok {
|
||||||
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
|
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
|
||||||
// }
|
return
|
||||||
//
|
}
|
||||||
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
|
||||||
// // Parse and verify ID Token payload.
|
nonceCookie, err := r.Cookie("nonce")
|
||||||
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
|
if err != nil {
|
||||||
// if err != nil {
|
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
|
||||||
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
|
return
|
||||||
// }
|
}
|
||||||
|
|
||||||
|
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
||||||
|
idToken, err := verifier.Verify(ctx, rawIDToken)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("ID token verification failed: %s", err.Error())
|
||||||
|
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if idToken.Nonce != nonceCookie.Value {
|
||||||
|
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
projects := make([]string, 0)
|
projects := make([]string, 0)
|
||||||
|
|
||||||
// Extract custom claims
|
// Extract custom claims from userinfo
|
||||||
var claims struct {
|
var claims struct {
|
||||||
Username string `json:"preferred_username"`
|
Username string `json:"preferred_username"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Profile struct {
|
// Keycloak realm-level roles
|
||||||
|
RealmAccess struct {
|
||||||
|
Roles []string `json:"roles"`
|
||||||
|
} `json:"realm_access"`
|
||||||
|
// Keycloak client-level roles
|
||||||
|
ResourceAccess struct {
|
||||||
Client struct {
|
Client struct {
|
||||||
Roles []string `json:"roles"`
|
Roles []string `json:"roles"`
|
||||||
} `json:"clustercockpit"`
|
} `json:"clustercockpit"`
|
||||||
} `json:"resource_access"`
|
} `json:"resource_access"`
|
||||||
}
|
}
|
||||||
if err := userInfo.Claims(&claims); err != nil {
|
if err := userInfo.Claims(&claims); err != nil {
|
||||||
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
|
cclog.Errorf("failed to extract claims: %s", err.Error())
|
||||||
|
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if claims.Username == "" {
|
||||||
|
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge roles from both client-level and realm-level access
|
||||||
|
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)
|
||||||
|
|
||||||
|
roleSet := make(map[string]bool)
|
||||||
|
for _, r := range oidcRoles {
|
||||||
|
switch r {
|
||||||
|
case "user":
|
||||||
|
roleSet[schema.GetRoleString(schema.RoleUser)] = true
|
||||||
|
case "admin":
|
||||||
|
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
|
||||||
|
case "manager":
|
||||||
|
roleSet[schema.GetRoleString(schema.RoleManager)] = true
|
||||||
|
case "support":
|
||||||
|
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var roles []string
|
var roles []string
|
||||||
for _, r := range claims.Profile.Client.Roles {
|
for role := range roleSet {
|
||||||
switch r {
|
roles = append(roles, role)
|
||||||
case "user":
|
|
||||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
|
||||||
case "admin":
|
|
||||||
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(roles) == 0 {
|
if len(roles) == 0 {
|
||||||
@@ -168,14 +224,18 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
|||||||
AuthSource: schema.AuthViaOIDC,
|
AuthSource: schema.AuthViaOIDC,
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.Keys.OpenIDConfig.SyncUserOnLogin {
|
if Keys.OpenIDConfig.SyncUserOnLogin || Keys.OpenIDConfig.UpdateUserOnLogin {
|
||||||
persistUser(user)
|
handleOIDCUser(user)
|
||||||
}
|
}
|
||||||
|
|
||||||
oa.authentication.SaveSession(rw, r, user)
|
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
|
||||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
|
||||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
|
||||||
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(ctx))
|
return
|
||||||
|
}
|
||||||
|
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||||
|
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||||
|
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
|
func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
|
||||||
@@ -190,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
|
|||||||
codeVerifier := oauth2.GenerateVerifier()
|
codeVerifier := oauth2.GenerateVerifier()
|
||||||
setCallbackCookie(rw, r, "verifier", codeVerifier)
|
setCallbackCookie(rw, r, "verifier", codeVerifier)
|
||||||
|
|
||||||
|
// Generate nonce for ID token replay protection
|
||||||
|
nonce, err := randString(16)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(rw, "Internal error", http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
setCallbackCookie(rw, r, "nonce", nonce)
|
||||||
|
|
||||||
|
// Build redirect URL from the incoming request
|
||||||
|
scheme := "https"
|
||||||
|
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
|
||||||
|
scheme = "http"
|
||||||
|
}
|
||||||
|
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
|
||||||
|
|
||||||
// Redirect user to consent page to ask for permission
|
// Redirect user to consent page to ask for permission
|
||||||
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
|
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
|
||||||
|
oauth2.S256ChallengeOption(codeVerifier),
|
||||||
|
oidc.Nonce(nonce))
|
||||||
http.Redirect(rw, r, url, http.StatusFound)
|
http.Redirect(rw, r, url, http.StatusFound)
|
||||||
}
|
}
|
||||||
|
|||||||
111
internal/auth/schema.go
Normal file
111
internal/auth/schema.go
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package auth
|
||||||
|
|
||||||
|
var configSchema = `
|
||||||
|
{
|
||||||
|
"jwts": {
|
||||||
|
"description": "For JWT token authentication.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"max-age": {
|
||||||
|
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"cookie-name": {
|
||||||
|
"description": "Cookie that should be checked for a JWT token.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"validate-user": {
|
||||||
|
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"trusted-issuer": {
|
||||||
|
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"sync-user-on-login": {
|
||||||
|
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"update-user-on-login": {
|
||||||
|
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.",
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["max-age"]
|
||||||
|
},
|
||||||
|
"oidc": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"provider": {
|
||||||
|
"description": "OpenID Connect provider URL.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"sync-user-on-login": {
|
||||||
|
"description": "Add non-existent user to DB at login attempt with values provided.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"update-user-on-login": {
|
||||||
|
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided.",
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["provider"]
|
||||||
|
},
|
||||||
|
"ldap": {
|
||||||
|
"description": "For LDAP Authentication and user synchronisation.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"url": {
|
||||||
|
"description": "URL of LDAP directory server.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"user-base": {
|
||||||
|
"description": "Base DN of user tree root.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"search-dn": {
|
||||||
|
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"user-bind": {
|
||||||
|
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"user-filter": {
|
||||||
|
"description": "Filter to extract users for syncing.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"username-attr": {
|
||||||
|
"description": "Attribute with full username. Default: gecos",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"sync-interval": {
|
||||||
|
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"sync-del-old-users": {
|
||||||
|
"description": "Delete obsolete users in database.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"uid-attr": {
|
||||||
|
"description": "LDAP attribute used as login username. Default: uid",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"sync-user-on-login": {
|
||||||
|
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"update-user-on-login": {
|
||||||
|
"description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.",
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
|
||||||
|
},
|
||||||
|
"required": ["jwts"]
|
||||||
|
}`
|
||||||
@@ -1,73 +1,158 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package config implements the program configuration data structures, validation and parsing
|
||||||
package config
|
package config
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"log"
|
"time"
|
||||||
"os"
|
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/resampler"
|
||||||
)
|
)
|
||||||
|
|
||||||
var Keys schema.ProgramConfig = schema.ProgramConfig{
|
type ProgramConfig struct {
|
||||||
|
// Address where the http (or https) server will listen on (for example: 'localhost:80').
|
||||||
|
Addr string `json:"addr"`
|
||||||
|
|
||||||
|
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
|
||||||
|
APIAllowedIPs []string `json:"api-allowed-ips"`
|
||||||
|
|
||||||
|
APISubjects *NATSConfig `json:"api-subjects"`
|
||||||
|
|
||||||
|
// Drop root permissions once .env was read and the port was taken.
|
||||||
|
User string `json:"user"`
|
||||||
|
Group string `json:"group"`
|
||||||
|
|
||||||
|
// Disable authentication (for everything: API, Web-UI, ...)
|
||||||
|
DisableAuthentication bool `json:"disable-authentication"`
|
||||||
|
|
||||||
|
// If `embed-static-files` is true (default), the frontend files are directly
|
||||||
|
// embeded into the go binary and expected to be in web/frontend. Only if
|
||||||
|
// it is false the files in `static-files` are served instead.
|
||||||
|
EmbedStaticFiles bool `json:"embed-static-files"`
|
||||||
|
StaticFiles string `json:"static-files"`
|
||||||
|
|
||||||
|
// Path to SQLite database file
|
||||||
|
DB string `json:"db"`
|
||||||
|
|
||||||
|
EnableJobTaggers bool `json:"enable-job-taggers"`
|
||||||
|
|
||||||
|
// Validate json input against schema
|
||||||
|
Validate bool `json:"validate"`
|
||||||
|
|
||||||
|
// If 0 or empty, the session does not expire!
|
||||||
|
SessionMaxAge string `json:"session-max-age"`
|
||||||
|
|
||||||
|
// If both those options are not empty, use HTTPS using those certificates.
|
||||||
|
HTTPSCertFile string `json:"https-cert-file"`
|
||||||
|
HTTPSKeyFile string `json:"https-key-file"`
|
||||||
|
|
||||||
|
// If not the empty string and `addr` does not end in ":80",
|
||||||
|
// redirect every request incoming at port 80 to that url.
|
||||||
|
RedirectHTTPTo string `json:"redirect-http-to"`
|
||||||
|
|
||||||
|
// Where to store MachineState files
|
||||||
|
MachineStateDir string `json:"machine-state-dir"`
|
||||||
|
|
||||||
|
// If not zero, automatically mark jobs as stopped running X seconds longer than their walltime.
|
||||||
|
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
|
||||||
|
|
||||||
|
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
|
||||||
|
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
|
||||||
|
|
||||||
|
// Energy Mix CO2 Emission Constant [g/kWh]
|
||||||
|
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
|
||||||
|
EmissionConstant int `json:"emission-constant"`
|
||||||
|
|
||||||
|
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
|
||||||
|
EnableResampling *ResampleConfig `json:"resampling"`
|
||||||
|
|
||||||
|
// Systemd unit name for log viewer (default: "clustercockpit")
|
||||||
|
SystemdUnit string `json:"systemd-unit"`
|
||||||
|
|
||||||
|
// Node state retention configuration
|
||||||
|
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
|
||||||
|
|
||||||
|
// Database tuning configuration
|
||||||
|
DbConfig *DbConfig `json:"db-config"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type DbConfig struct {
|
||||||
|
CacheSizeMB int `json:"cache-size-mb"`
|
||||||
|
SoftHeapLimitMB int `json:"soft-heap-limit-mb"`
|
||||||
|
MaxOpenConnections int `json:"max-open-connections"`
|
||||||
|
MaxIdleConnections int `json:"max-idle-connections"`
|
||||||
|
ConnectionMaxIdleTimeMins int `json:"max-idle-time-minutes"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStateRetention struct {
|
||||||
|
Policy string `json:"policy"` // "delete" or "move"
|
||||||
|
Age int `json:"age"` // hours, default 24
|
||||||
|
TargetKind string `json:"target-kind"` // "file" or "s3"
|
||||||
|
TargetPath string `json:"target-path"`
|
||||||
|
TargetEndpoint string `json:"target-endpoint"`
|
||||||
|
TargetBucket string `json:"target-bucket"`
|
||||||
|
TargetAccessKey string `json:"target-access-key"`
|
||||||
|
TargetSecretKey string `json:"target-secret-key"`
|
||||||
|
TargetRegion string `json:"target-region"`
|
||||||
|
TargetUsePathStyle bool `json:"target-use-path-style"`
|
||||||
|
MaxFileSizeMB int `json:"max-file-size-mb"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ResampleConfig struct {
|
||||||
|
// Minimum number of points to trigger resampling of data
|
||||||
|
MinimumPoints int `json:"minimum-points"`
|
||||||
|
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
|
||||||
|
Resolutions []int `json:"resolutions"`
|
||||||
|
// Trigger next zoom level at less than this many visible datapoints
|
||||||
|
Trigger int `json:"trigger"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NATSConfig struct {
|
||||||
|
SubjectJobEvent string `json:"subject-job-event"`
|
||||||
|
SubjectNodeState string `json:"subject-node-state"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type IntRange struct {
|
||||||
|
From int `json:"from"`
|
||||||
|
To int `json:"to"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type TimeRange struct {
|
||||||
|
From *time.Time `json:"from"`
|
||||||
|
To *time.Time `json:"to"`
|
||||||
|
Range string `json:"range,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type FilterRanges struct {
|
||||||
|
Duration *IntRange `json:"duration"`
|
||||||
|
NumNodes *IntRange `json:"num-nodes"`
|
||||||
|
StartTime *TimeRange `json:"start-time"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var Keys ProgramConfig = ProgramConfig{
|
||||||
Addr: "localhost:8080",
|
Addr: "localhost:8080",
|
||||||
DisableAuthentication: false,
|
|
||||||
EmbedStaticFiles: true,
|
EmbedStaticFiles: true,
|
||||||
DBDriver: "sqlite3",
|
|
||||||
DB: "./var/job.db",
|
DB: "./var/job.db",
|
||||||
Archive: json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`),
|
|
||||||
DisableArchive: false,
|
|
||||||
Validate: false,
|
|
||||||
SessionMaxAge: "168h",
|
SessionMaxAge: "168h",
|
||||||
StopJobsExceedingWalltime: 0,
|
StopJobsExceedingWalltime: 0,
|
||||||
ShortRunningJobsDuration: 5 * 60,
|
ShortRunningJobsDuration: 5 * 60,
|
||||||
UiDefaults: map[string]interface{}{
|
|
||||||
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
|
||||||
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
|
|
||||||
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
|
||||||
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
|
||||||
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
|
||||||
"job_view_showFootprint": true,
|
|
||||||
"job_list_usePaging": true,
|
|
||||||
"plot_general_colorBackground": true,
|
|
||||||
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
|
|
||||||
"plot_general_lineWidth": 3,
|
|
||||||
"plot_list_jobsPerPage": 50,
|
|
||||||
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw"},
|
|
||||||
"plot_view_plotsPerRow": 3,
|
|
||||||
"plot_view_showPolarplot": true,
|
|
||||||
"plot_view_showRoofline": true,
|
|
||||||
"plot_view_showStatTable": true,
|
|
||||||
"system_view_selectedMetric": "cpu_load",
|
|
||||||
"analysis_view_selectedTopEntity": "user",
|
|
||||||
"analysis_view_selectedTopCategory": "totalWalltime",
|
|
||||||
"status_view_selectedTopUserCategory": "totalJobs",
|
|
||||||
"status_view_selectedTopProjectCategory": "totalJobs",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func Init(flagConfigFile string) {
|
func Init(mainConfig json.RawMessage) {
|
||||||
raw, err := os.ReadFile(flagConfigFile)
|
Validate(configSchema, mainConfig)
|
||||||
if err != nil {
|
dec := json.NewDecoder(bytes.NewReader(mainConfig))
|
||||||
if !os.IsNotExist(err) {
|
dec.DisallowUnknownFields()
|
||||||
log.Fatalf("CONFIG ERROR: %v", err)
|
if err := dec.Decode(&Keys); err != nil {
|
||||||
}
|
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||||
} else {
|
}
|
||||||
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
|
|
||||||
log.Fatalf("Validate config: %v\n", err)
|
|
||||||
}
|
|
||||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
|
||||||
dec.DisallowUnknownFields()
|
|
||||||
if err := dec.Decode(&Keys); err != nil {
|
|
||||||
log.Fatalf("could not decode: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
|
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
|
||||||
log.Fatal("At least one cluster required in config!")
|
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,16 +1,26 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package config
|
package config
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestInit(t *testing.T) {
|
func TestInit(t *testing.T) {
|
||||||
fp := "../../configs/config.json"
|
fp := "../../configs/config.json"
|
||||||
Init(fp)
|
ccconf.Init(fp)
|
||||||
|
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||||
|
Init(cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Abort("Main configuration must be present")
|
||||||
|
}
|
||||||
|
|
||||||
if Keys.Addr != "0.0.0.0:443" {
|
if Keys.Addr != "0.0.0.0:443" {
|
||||||
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
|
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
|
||||||
}
|
}
|
||||||
@@ -18,7 +28,13 @@ func TestInit(t *testing.T) {
|
|||||||
|
|
||||||
func TestInitMinimal(t *testing.T) {
|
func TestInitMinimal(t *testing.T) {
|
||||||
fp := "../../configs/config-demo.json"
|
fp := "../../configs/config-demo.json"
|
||||||
Init(fp)
|
ccconf.Init(fp)
|
||||||
|
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||||
|
Init(cfg)
|
||||||
|
} else {
|
||||||
|
cclog.Abort("Main configuration must be present")
|
||||||
|
}
|
||||||
|
|
||||||
if Keys.Addr != "127.0.0.1:8080" {
|
if Keys.Addr != "127.0.0.1:8080" {
|
||||||
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
|
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
|
||||||
}
|
}
|
||||||
|
|||||||
51
internal/config/default_metrics.go
Normal file
51
internal/config/default_metrics.go
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DEPRECATED: SUPERSEDED BY NEW USER CONFIG - userConfig.go / web.go
|
||||||
|
|
||||||
|
type DefaultMetricsCluster struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
DefaultMetrics string `json:"default-metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type DefaultMetricsConfig struct {
|
||||||
|
Clusters []DefaultMetricsCluster `json:"clusters"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) {
|
||||||
|
filePath := "default_metrics.json"
|
||||||
|
if _, err := os.Stat(filePath); os.IsNotExist(err) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(filePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg DefaultMetricsConfig
|
||||||
|
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseMetricsString(s string) []string {
|
||||||
|
parts := strings.Split(s, ",")
|
||||||
|
var metrics []string
|
||||||
|
for _, p := range parts {
|
||||||
|
trimmed := strings.TrimSpace(p)
|
||||||
|
if trimmed != "" {
|
||||||
|
metrics = append(metrics, trimmed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return metrics
|
||||||
|
}
|
||||||
208
internal/config/schema.go
Normal file
208
internal/config/schema.go
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
var configSchema = `
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"addr": {
|
||||||
|
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"api-allowed-ips": {
|
||||||
|
"description": "Addresses from which secured API endpoints can be reached",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"user": {
|
||||||
|
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"group": {
|
||||||
|
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"disable-authentication": {
|
||||||
|
"description": "Disable authentication (for everything: API, Web-UI, ...).",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"embed-static-files": {
|
||||||
|
"description": "If all files in web/frontend/public should be served from within the binary itself (they are embedded) or not.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"static-files": {
|
||||||
|
"description": "Folder where static assets can be found, if embed-static-files is false.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"db": {
|
||||||
|
"description": "Path to SQLite database file (e.g., './var/job.db')",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"enable-job-taggers": {
|
||||||
|
"description": "Turn on automatic application and jobclass taggers",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"validate": {
|
||||||
|
"description": "Validate all input json documents against json schema.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"session-max-age": {
|
||||||
|
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"https-cert-file": {
|
||||||
|
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"https-key-file": {
|
||||||
|
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"redirect-http-to": {
|
||||||
|
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"stop-jobs-exceeding-walltime": {
|
||||||
|
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"short-running-jobs-duration": {
|
||||||
|
"description": "Do not show running jobs shorter than X seconds.",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"emission-constant": {
|
||||||
|
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"machine-state-dir": {
|
||||||
|
"description": "Where to store MachineState files.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"systemd-unit": {
|
||||||
|
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"resampling": {
|
||||||
|
"description": "Enable dynamic zoom in frontend metric plots.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"minimum-points": {
|
||||||
|
"description": "Minimum points to trigger resampling of time-series data.",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"trigger": {
|
||||||
|
"description": "Trigger next zoom level at less than this many visible datapoints.",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"resolutions": {
|
||||||
|
"description": "Array of resampling target resolutions, in seconds.",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["trigger", "resolutions"]
|
||||||
|
},
|
||||||
|
"api-subjects": {
|
||||||
|
"description": "NATS subjects configuration for subscribing to job and node events.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"subject-job-event": {
|
||||||
|
"description": "NATS subject for job events (start_job, stop_job)",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"subject-node-state": {
|
||||||
|
"description": "NATS subject for node state updates",
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["subject-job-event", "subject-node-state"]
|
||||||
|
},
|
||||||
|
"nodestate-retention": {
|
||||||
|
"description": "Node state retention configuration for cleaning up old node_state rows.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"policy": {
|
||||||
|
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["delete", "move"]
|
||||||
|
},
|
||||||
|
"age": {
|
||||||
|
"description": "Retention age in hours (default: 24).",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"target-kind": {
|
||||||
|
"description": "Target kind for parquet archiving: 'file' or 's3'.",
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["file", "s3"]
|
||||||
|
},
|
||||||
|
"target-path": {
|
||||||
|
"description": "Filesystem path for parquet file target.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-endpoint": {
|
||||||
|
"description": "S3 endpoint URL.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-bucket": {
|
||||||
|
"description": "S3 bucket name.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-access-key": {
|
||||||
|
"description": "S3 access key.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-secret-key": {
|
||||||
|
"description": "S3 secret key.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-region": {
|
||||||
|
"description": "S3 region.",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"target-use-path-style": {
|
||||||
|
"description": "Use path-style S3 addressing.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"max-file-size-mb": {
|
||||||
|
"description": "Maximum parquet file size in MB (default: 128).",
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["policy"]
|
||||||
|
},
|
||||||
|
"db-config": {
|
||||||
|
"description": "SQLite database tuning configuration.",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"cache-size-mb": {
|
||||||
|
"description": "SQLite page cache size per connection in MB (default: 2048).",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"soft-heap-limit-mb": {
|
||||||
|
"description": "Process-wide SQLite soft heap limit in MB (default: 16384).",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"max-open-connections": {
|
||||||
|
"description": "Maximum number of open database connections (default: 4).",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"max-idle-connections": {
|
||||||
|
"description": "Maximum number of idle database connections (default: 4).",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"max-idle-time-minutes": {
|
||||||
|
"description": "Maximum idle time for a connection in minutes (default: 10).",
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`
|
||||||
29
internal/config/validate.go
Normal file
29
internal/config/validate.go
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-backend.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
"github.com/santhosh-tekuri/jsonschema/v5"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Validate(schema string, instance json.RawMessage) {
|
||||||
|
sch, err := jsonschema.CompileString("schema.json", schema)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Fatalf("%#v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var v any
|
||||||
|
if err := json.Unmarshal([]byte(instance), &v); err != nil {
|
||||||
|
cclog.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = sch.Validate(v); err != nil {
|
||||||
|
cclog.Fatalf("%#v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package model
|
package model
|
||||||
|
|||||||
@@ -3,24 +3,50 @@
|
|||||||
package model
|
package model
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type ClusterMetricWithName struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Unit *schema.Unit `json:"unit,omitempty"`
|
||||||
|
Timestep int `json:"timestep"`
|
||||||
|
Data []schema.Float `json:"data"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ClusterMetrics struct {
|
||||||
|
NodeCount int `json:"nodeCount"`
|
||||||
|
Metrics []*ClusterMetricWithName `json:"metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
type Count struct {
|
type Count struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Count int `json:"count"`
|
Count int `json:"count"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type EnergyFootprintValue struct {
|
||||||
|
Hardware string `json:"hardware"`
|
||||||
|
Metric string `json:"metric"`
|
||||||
|
Value float64 `json:"value"`
|
||||||
|
}
|
||||||
|
|
||||||
type FloatRange struct {
|
type FloatRange struct {
|
||||||
From float64 `json:"from"`
|
From float64 `json:"from"`
|
||||||
To float64 `json:"to"`
|
To float64 `json:"to"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type FootprintValue struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Stat string `json:"stat"`
|
||||||
|
Value float64 `json:"value"`
|
||||||
|
}
|
||||||
|
|
||||||
type Footprints struct {
|
type Footprints struct {
|
||||||
TimeWeights *TimeWeights `json:"timeWeights"`
|
TimeWeights *TimeWeights `json:"timeWeights"`
|
||||||
Metrics []*MetricFootprints `json:"metrics"`
|
Metrics []*MetricFootprints `json:"metrics"`
|
||||||
@@ -38,25 +64,26 @@ type IntRangeOutput struct {
|
|||||||
|
|
||||||
type JobFilter struct {
|
type JobFilter struct {
|
||||||
Tags []string `json:"tags,omitempty"`
|
Tags []string `json:"tags,omitempty"`
|
||||||
|
DbID []string `json:"dbId,omitempty"`
|
||||||
JobID *StringInput `json:"jobId,omitempty"`
|
JobID *StringInput `json:"jobId,omitempty"`
|
||||||
ArrayJobID *int `json:"arrayJobId,omitempty"`
|
ArrayJobID *int `json:"arrayJobId,omitempty"`
|
||||||
User *StringInput `json:"user,omitempty"`
|
User *StringInput `json:"user,omitempty"`
|
||||||
Project *StringInput `json:"project,omitempty"`
|
Project *StringInput `json:"project,omitempty"`
|
||||||
JobName *StringInput `json:"jobName,omitempty"`
|
JobName *StringInput `json:"jobName,omitempty"`
|
||||||
Cluster *StringInput `json:"cluster,omitempty"`
|
Cluster *StringInput `json:"cluster,omitempty"`
|
||||||
|
SubCluster *StringInput `json:"subCluster,omitempty"`
|
||||||
Partition *StringInput `json:"partition,omitempty"`
|
Partition *StringInput `json:"partition,omitempty"`
|
||||||
Duration *schema.IntRange `json:"duration,omitempty"`
|
Duration *config.IntRange `json:"duration,omitempty"`
|
||||||
|
Energy *FloatRange `json:"energy,omitempty"`
|
||||||
MinRunningFor *int `json:"minRunningFor,omitempty"`
|
MinRunningFor *int `json:"minRunningFor,omitempty"`
|
||||||
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
|
NumNodes *config.IntRange `json:"numNodes,omitempty"`
|
||||||
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
|
NumAccelerators *config.IntRange `json:"numAccelerators,omitempty"`
|
||||||
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
|
NumHWThreads *config.IntRange `json:"numHWThreads,omitempty"`
|
||||||
StartTime *schema.TimeRange `json:"startTime,omitempty"`
|
StartTime *config.TimeRange `json:"startTime,omitempty"`
|
||||||
State []schema.JobState `json:"state,omitempty"`
|
State []schema.JobState `json:"state,omitempty"`
|
||||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
|
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
|
||||||
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
|
Shared *string `json:"shared,omitempty"`
|
||||||
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
|
Schedule *string `json:"schedule,omitempty"`
|
||||||
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
|
|
||||||
Exclusive *int `json:"exclusive,omitempty"`
|
|
||||||
Node *StringInput `json:"node,omitempty"`
|
Node *StringInput `json:"node,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,9 +112,23 @@ type JobResultList struct {
|
|||||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type JobStats struct {
|
||||||
|
ID int `json:"id"`
|
||||||
|
JobID string `json:"jobId"`
|
||||||
|
StartTime int `json:"startTime"`
|
||||||
|
Duration int `json:"duration"`
|
||||||
|
Cluster string `json:"cluster"`
|
||||||
|
SubCluster string `json:"subCluster"`
|
||||||
|
NumNodes int `json:"numNodes"`
|
||||||
|
NumHWThreads *int `json:"numHWThreads,omitempty"`
|
||||||
|
NumAccelerators *int `json:"numAccelerators,omitempty"`
|
||||||
|
Stats []*NamedStats `json:"stats"`
|
||||||
|
}
|
||||||
|
|
||||||
type JobsStatistics struct {
|
type JobsStatistics struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
TotalUsers int `json:"totalUsers"`
|
||||||
TotalJobs int `json:"totalJobs"`
|
TotalJobs int `json:"totalJobs"`
|
||||||
RunningJobs int `json:"runningJobs"`
|
RunningJobs int `json:"runningJobs"`
|
||||||
ShortJobs int `json:"shortJobs"`
|
ShortJobs int `json:"shortJobs"`
|
||||||
@@ -120,20 +161,73 @@ type MetricHistoPoint struct {
|
|||||||
type MetricHistoPoints struct {
|
type MetricHistoPoints struct {
|
||||||
Metric string `json:"metric"`
|
Metric string `json:"metric"`
|
||||||
Unit string `json:"unit"`
|
Unit string `json:"unit"`
|
||||||
|
Stat *string `json:"stat,omitempty"`
|
||||||
Data []*MetricHistoPoint `json:"data,omitempty"`
|
Data []*MetricHistoPoint `json:"data,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type MetricStatItem struct {
|
||||||
|
MetricName string `json:"metricName"`
|
||||||
|
Range *FloatRange `json:"range"`
|
||||||
|
}
|
||||||
|
|
||||||
type Mutation struct {
|
type Mutation struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type NamedStats struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Data *schema.MetricStatistics `json:"data"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NamedStatsWithScope struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Scope schema.MetricScope `json:"scope"`
|
||||||
|
Stats []*ScopedStats `json:"stats"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeFilter struct {
|
||||||
|
Hostname *StringInput `json:"hostname,omitempty"`
|
||||||
|
Cluster *StringInput `json:"cluster,omitempty"`
|
||||||
|
SubCluster *StringInput `json:"subCluster,omitempty"`
|
||||||
|
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
|
||||||
|
HealthState *string `json:"healthState,omitempty"`
|
||||||
|
TimeStart *int `json:"timeStart,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type NodeMetrics struct {
|
type NodeMetrics struct {
|
||||||
Host string `json:"host"`
|
Host string `json:"host"`
|
||||||
|
State string `json:"state"`
|
||||||
SubCluster string `json:"subCluster"`
|
SubCluster string `json:"subCluster"`
|
||||||
Metrics []*JobMetricWithName `json:"metrics"`
|
Metrics []*JobMetricWithName `json:"metrics"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type NodeStateResultList struct {
|
||||||
|
Items []*schema.Node `json:"items"`
|
||||||
|
Count *int `json:"count,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStates struct {
|
||||||
|
State string `json:"state"`
|
||||||
|
Count int `json:"count"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodeStatesTimed struct {
|
||||||
|
State string `json:"state"`
|
||||||
|
Counts []int `json:"counts"`
|
||||||
|
Times []int `json:"times"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NodesResultList struct {
|
||||||
|
Items []*NodeMetrics `json:"items"`
|
||||||
|
Offset *int `json:"offset,omitempty"`
|
||||||
|
Limit *int `json:"limit,omitempty"`
|
||||||
|
Count *int `json:"count,omitempty"`
|
||||||
|
TotalNodes *int `json:"totalNodes,omitempty"`
|
||||||
|
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type OrderByInput struct {
|
type OrderByInput struct {
|
||||||
Field string `json:"field"`
|
Field string `json:"field"`
|
||||||
|
Type string `json:"type"`
|
||||||
Order SortDirectionEnum `json:"order"`
|
Order SortDirectionEnum `json:"order"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -142,7 +236,10 @@ type PageRequest struct {
|
|||||||
Page int `json:"page"`
|
Page int `json:"page"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Query struct {
|
type ScopedStats struct {
|
||||||
|
Hostname string `json:"hostname"`
|
||||||
|
ID *string `json:"id,omitempty"`
|
||||||
|
Data *schema.MetricStatistics `json:"data"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type StringInput struct {
|
type StringInput struct {
|
||||||
@@ -155,8 +252,9 @@ type StringInput struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type TimeRangeOutput struct {
|
type TimeRangeOutput struct {
|
||||||
From time.Time `json:"from"`
|
Range *string `json:"range,omitempty"`
|
||||||
To time.Time `json:"to"`
|
From time.Time `json:"from"`
|
||||||
|
To time.Time `json:"to"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type TimeWeights struct {
|
type TimeWeights struct {
|
||||||
@@ -174,20 +272,22 @@ type User struct {
|
|||||||
type Aggregate string
|
type Aggregate string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
AggregateUser Aggregate = "USER"
|
AggregateUser Aggregate = "USER"
|
||||||
AggregateProject Aggregate = "PROJECT"
|
AggregateProject Aggregate = "PROJECT"
|
||||||
AggregateCluster Aggregate = "CLUSTER"
|
AggregateCluster Aggregate = "CLUSTER"
|
||||||
|
AggregateSubcluster Aggregate = "SUBCLUSTER"
|
||||||
)
|
)
|
||||||
|
|
||||||
var AllAggregate = []Aggregate{
|
var AllAggregate = []Aggregate{
|
||||||
AggregateUser,
|
AggregateUser,
|
||||||
AggregateProject,
|
AggregateProject,
|
||||||
AggregateCluster,
|
AggregateCluster,
|
||||||
|
AggregateSubcluster,
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e Aggregate) IsValid() bool {
|
func (e Aggregate) IsValid() bool {
|
||||||
switch e {
|
switch e {
|
||||||
case AggregateUser, AggregateProject, AggregateCluster:
|
case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster:
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -197,7 +297,7 @@ func (e Aggregate) String() string {
|
|||||||
return string(e)
|
return string(e)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
|
func (e *Aggregate) UnmarshalGQL(v any) error {
|
||||||
str, ok := v.(string)
|
str, ok := v.(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("enums must be strings")
|
return fmt.Errorf("enums must be strings")
|
||||||
@@ -214,11 +314,26 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
|
|||||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *Aggregate) UnmarshalJSON(b []byte) error {
|
||||||
|
s, err := strconv.Unquote(string(b))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return e.UnmarshalGQL(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e Aggregate) MarshalJSON() ([]byte, error) {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
e.MarshalGQL(&buf)
|
||||||
|
return buf.Bytes(), nil
|
||||||
|
}
|
||||||
|
|
||||||
type SortByAggregate string
|
type SortByAggregate string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
|
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
|
||||||
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
|
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
|
||||||
|
SortByAggregateTotalusers SortByAggregate = "TOTALUSERS"
|
||||||
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
|
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
|
||||||
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
|
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
|
||||||
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
|
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
|
||||||
@@ -230,6 +345,7 @@ const (
|
|||||||
var AllSortByAggregate = []SortByAggregate{
|
var AllSortByAggregate = []SortByAggregate{
|
||||||
SortByAggregateTotalwalltime,
|
SortByAggregateTotalwalltime,
|
||||||
SortByAggregateTotaljobs,
|
SortByAggregateTotaljobs,
|
||||||
|
SortByAggregateTotalusers,
|
||||||
SortByAggregateTotalnodes,
|
SortByAggregateTotalnodes,
|
||||||
SortByAggregateTotalnodehours,
|
SortByAggregateTotalnodehours,
|
||||||
SortByAggregateTotalcores,
|
SortByAggregateTotalcores,
|
||||||
@@ -240,7 +356,7 @@ var AllSortByAggregate = []SortByAggregate{
|
|||||||
|
|
||||||
func (e SortByAggregate) IsValid() bool {
|
func (e SortByAggregate) IsValid() bool {
|
||||||
switch e {
|
switch e {
|
||||||
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -250,7 +366,7 @@ func (e SortByAggregate) String() string {
|
|||||||
return string(e)
|
return string(e)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *SortByAggregate) UnmarshalGQL(v interface{}) error {
|
func (e *SortByAggregate) UnmarshalGQL(v any) error {
|
||||||
str, ok := v.(string)
|
str, ok := v.(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("enums must be strings")
|
return fmt.Errorf("enums must be strings")
|
||||||
@@ -267,6 +383,20 @@ func (e SortByAggregate) MarshalGQL(w io.Writer) {
|
|||||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *SortByAggregate) UnmarshalJSON(b []byte) error {
|
||||||
|
s, err := strconv.Unquote(string(b))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return e.UnmarshalGQL(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e SortByAggregate) MarshalJSON() ([]byte, error) {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
e.MarshalGQL(&buf)
|
||||||
|
return buf.Bytes(), nil
|
||||||
|
}
|
||||||
|
|
||||||
type SortDirectionEnum string
|
type SortDirectionEnum string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -291,7 +421,7 @@ func (e SortDirectionEnum) String() string {
|
|||||||
return string(e)
|
return string(e)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
|
||||||
str, ok := v.(string)
|
str, ok := v.(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("enums must be strings")
|
return fmt.Errorf("enums must be strings")
|
||||||
@@ -307,3 +437,17 @@ func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
|||||||
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
|
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
|
||||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *SortDirectionEnum) UnmarshalJSON(b []byte) error {
|
||||||
|
s, err := strconv.Unquote(string(b))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return e.UnmarshalGQL(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e SortDirectionEnum) MarshalJSON() ([]byte, error) {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
e.MarshalGQL(&buf)
|
||||||
|
return buf.Bytes(), nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,15 +1,39 @@
|
|||||||
package graph
|
package graph
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
)
|
)
|
||||||
|
|
||||||
// This file will not be regenerated automatically.
|
// This file will not be regenerated automatically.
|
||||||
//
|
//
|
||||||
// It serves as dependency injection for your app, add any dependencies you require here.
|
// It serves as dependency injection for your app, add any dependencies you require here.
|
||||||
|
var (
|
||||||
|
initOnce sync.Once
|
||||||
|
resolverInstance *Resolver
|
||||||
|
)
|
||||||
|
|
||||||
type Resolver struct {
|
type Resolver struct {
|
||||||
DB *sqlx.DB
|
DB *sqlx.DB
|
||||||
Repo *repository.JobRepository
|
Repo *repository.JobRepository
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Init() {
|
||||||
|
initOnce.Do(func() {
|
||||||
|
db := repository.GetConnection()
|
||||||
|
resolverInstance = &Resolver{
|
||||||
|
DB: db.DB, Repo: repository.GetJobRepository(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetResolverInstance() *Resolver {
|
||||||
|
if resolverInstance == nil {
|
||||||
|
cclog.Fatal("Authentication module not initialized!")
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolverInstance
|
||||||
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,20 +1,21 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package graph
|
package graph
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"slices"
|
||||||
|
|
||||||
"github.com/99designs/gqlgen/graphql"
|
"github.com/99designs/gqlgen/graphql"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const MAX_JOBS_FOR_ANALYSIS = 500
|
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||||
@@ -24,11 +25,11 @@ func (r *queryResolver) rooflineHeatmap(
|
|||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
filter []*model.JobFilter,
|
filter []*model.JobFilter,
|
||||||
rows int, cols int,
|
rows int, cols int,
|
||||||
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
minX float64, minY float64, maxX float64, maxY float64,
|
||||||
|
) ([][]float64, error) {
|
||||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Error while querying jobs for roofline")
|
cclog.Error("Error while querying jobs for roofline")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||||
@@ -47,15 +48,22 @@ func (r *queryResolver) rooflineHeatmap(
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||||
|
// resolution := 0
|
||||||
|
|
||||||
|
// for _, mc := range metricConfigs {
|
||||||
|
// resolution = max(resolution, mc.Timestep)
|
||||||
|
// }
|
||||||
|
|
||||||
|
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||||
if flops_ == nil && membw_ == nil {
|
if flops_ == nil && membw_ == nil {
|
||||||
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
|
||||||
continue
|
continue
|
||||||
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||||
}
|
}
|
||||||
@@ -63,7 +71,7 @@ func (r *queryResolver) rooflineHeatmap(
|
|||||||
flops, ok1 := flops_["node"]
|
flops, ok1 := flops_["node"]
|
||||||
membw, ok2 := membw_["node"]
|
membw, ok2 := membw_["node"]
|
||||||
if !ok1 || !ok2 {
|
if !ok1 || !ok2 {
|
||||||
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
cclog.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||||
continue
|
continue
|
||||||
// TODO/FIXME:
|
// TODO/FIXME:
|
||||||
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||||
@@ -98,7 +106,7 @@ func (r *queryResolver) rooflineHeatmap(
|
|||||||
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Error while querying jobs for footprint")
|
cclog.Error("Error while querying jobs for footprint")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||||
@@ -120,8 +128,8 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||||
log.Error("Error while loading averages for footprint")
|
cclog.Error("Error while loading averages for footprint")
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
|||||||
func requireField(ctx context.Context, name string) bool {
|
func requireField(ctx context.Context, name string) bool {
|
||||||
fields := graphql.CollectAllFields(ctx)
|
fields := graphql.CollectAllFields(ctx)
|
||||||
|
|
||||||
for _, f := range fields {
|
return slices.Contains(fields, name)
|
||||||
if f == name {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|||||||
132
internal/importer/README.md
Normal file
132
internal/importer/README.md
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
# Importer Package
|
||||||
|
|
||||||
|
The `importer` package provides functionality for importing job data into the ClusterCockpit database from archived job files.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This package supports two primary import workflows:
|
||||||
|
|
||||||
|
1. **Bulk Database Initialization** - Reinitialize the entire job database from archived jobs
|
||||||
|
2. **Individual Job Import** - Import specific jobs from metadata/data file pairs
|
||||||
|
|
||||||
|
Both workflows enrich job metadata by calculating performance footprints and energy consumption metrics before persisting to the database.
|
||||||
|
|
||||||
|
## Main Entry Points
|
||||||
|
|
||||||
|
### InitDB()
|
||||||
|
|
||||||
|
Reinitializes the job database from all archived jobs.
|
||||||
|
|
||||||
|
```go
|
||||||
|
if err := importer.InitDB(); err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This function:
|
||||||
|
- Flushes existing job, tag, and jobtag tables
|
||||||
|
- Iterates through all jobs in the configured archive
|
||||||
|
- Enriches each job with calculated metrics
|
||||||
|
- Inserts jobs into the database in batched transactions (100 jobs per batch)
|
||||||
|
- Continues on individual job failures, logging errors
|
||||||
|
|
||||||
|
**Use Case**: Initial database setup or complete database rebuild from archive.
|
||||||
|
|
||||||
|
### HandleImportFlag(flag string)
|
||||||
|
|
||||||
|
Imports jobs from specified file pairs.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Format: "<meta.json>:<data.json>[,<meta2.json>:<data2.json>,...]"
|
||||||
|
flag := "/path/to/meta.json:/path/to/data.json"
|
||||||
|
if err := importer.HandleImportFlag(flag); err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This function:
|
||||||
|
- Parses the comma-separated file pairs
|
||||||
|
- Validates metadata and job data against schemas (if validation enabled)
|
||||||
|
- Enriches each job with footprints and energy metrics
|
||||||
|
- Imports jobs into both the archive and database
|
||||||
|
- Fails fast on the first error
|
||||||
|
|
||||||
|
**Use Case**: Importing specific jobs from external sources or manual job additions.
|
||||||
|
|
||||||
|
## Job Enrichment
|
||||||
|
|
||||||
|
Both import workflows use `enrichJobMetadata()` to calculate:
|
||||||
|
|
||||||
|
### Performance Footprints
|
||||||
|
|
||||||
|
Performance footprints are calculated from metric averages based on the subcluster configuration:
|
||||||
|
|
||||||
|
```go
|
||||||
|
job.Footprint["mem_used_avg"] = 45.2 // GB
|
||||||
|
job.Footprint["cpu_load_avg"] = 0.87 // percentage
|
||||||
|
```
|
||||||
|
|
||||||
|
### Energy Metrics
|
||||||
|
|
||||||
|
Energy consumption is calculated from power metrics using the formula:
|
||||||
|
|
||||||
|
```
|
||||||
|
Energy (kWh) = (Power (W) × Duration (s) / 3600) / 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
For each energy metric:
|
||||||
|
```go
|
||||||
|
job.EnergyFootprint["acc_power"] = 12.5 // kWh
|
||||||
|
job.Energy = 150.2 // Total energy in kWh
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Energy calculations for metrics with unit "energy" (Joules) are not yet implemented.
|
||||||
|
|
||||||
|
## Data Validation
|
||||||
|
|
||||||
|
### SanityChecks(job *schema.Job)
|
||||||
|
|
||||||
|
Validates job metadata before database insertion:
|
||||||
|
|
||||||
|
- Cluster exists in configuration
|
||||||
|
- Subcluster is valid (assigns if needed)
|
||||||
|
- Job state is valid
|
||||||
|
- Resources and user fields are populated
|
||||||
|
- Node counts and hardware thread counts are positive
|
||||||
|
- Resource count matches declared node count
|
||||||
|
|
||||||
|
## Normalization Utilities
|
||||||
|
|
||||||
|
The package includes utilities for normalizing metric values to appropriate SI prefixes:
|
||||||
|
|
||||||
|
### Normalize(avg float64, prefix string)
|
||||||
|
|
||||||
|
Adjusts values and SI prefixes for readability:
|
||||||
|
|
||||||
|
```go
|
||||||
|
factor, newPrefix := importer.Normalize(2048.0, "M")
|
||||||
|
// Converts 2048 MB → ~2.0 GB
|
||||||
|
// Returns: factor for conversion, "G"
|
||||||
|
```
|
||||||
|
|
||||||
|
This is useful for automatically scaling metrics (e.g., memory, storage) to human-readable units.
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- `github.com/ClusterCockpit/cc-backend/internal/repository` - Database operations
|
||||||
|
- `github.com/ClusterCockpit/cc-backend/pkg/archive` - Job archive access
|
||||||
|
- `github.com/ClusterCockpit/cc-lib/schema` - Job schema definitions
|
||||||
|
- `github.com/ClusterCockpit/cc-lib/ccLogger` - Logging
|
||||||
|
- `github.com/ClusterCockpit/cc-lib/ccUnits` - SI unit handling
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
- **InitDB**: Continues processing on individual job failures, logs errors, returns summary
|
||||||
|
- **HandleImportFlag**: Fails fast on first error, returns immediately
|
||||||
|
- Both functions log detailed error context for debugging
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
- **Transaction Batching**: InitDB processes jobs in batches of 100 for optimal database performance
|
||||||
|
- **Tag Caching**: Tag IDs are cached during import to minimize database queries
|
||||||
|
- **Progress Reporting**: InitDB prints progress updates during bulk operations
|
||||||
@@ -1,7 +1,8 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package importer
|
package importer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -10,20 +11,34 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
|
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
|
||||||
|
//
|
||||||
|
// The flag format is: "<path-to-meta.json>:<path-to-data.json>[,<path-to-meta2.json>:<path-to-data2.json>,...]"
|
||||||
|
//
|
||||||
|
// For each job pair, this function:
|
||||||
|
// 1. Reads and validates the metadata JSON file (schema.Job)
|
||||||
|
// 2. Reads and validates the job data JSON file (schema.JobData)
|
||||||
|
// 3. Enriches the job with calculated footprints and energy metrics
|
||||||
|
// 4. Validates the job using SanityChecks()
|
||||||
|
// 5. Imports the job into the archive
|
||||||
|
// 6. Inserts the job into the database with associated tags
|
||||||
|
//
|
||||||
|
// Schema validation is performed if config.Keys.Validate is true.
|
||||||
|
//
|
||||||
|
// Returns an error if file reading, validation, enrichment, or database operations fail.
|
||||||
|
// The function stops processing on the first error encountered.
|
||||||
func HandleImportFlag(flag string) error {
|
func HandleImportFlag(flag string) error {
|
||||||
r := repository.GetJobRepository()
|
r := repository.GetJobRepository()
|
||||||
|
|
||||||
for _, pair := range strings.Split(flag, ",") {
|
for pair := range strings.SplitSeq(flag, ",") {
|
||||||
files := strings.Split(pair, ":")
|
files := strings.Split(pair, ":")
|
||||||
if len(files) != 2 {
|
if len(files) != 2 {
|
||||||
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
|
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
|
||||||
@@ -31,7 +46,7 @@ func HandleImportFlag(flag string) error {
|
|||||||
|
|
||||||
raw, err := os.ReadFile(files[0])
|
raw, err := os.ReadFile(files[0])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while reading metadata file for import")
|
cclog.Warn("Error while reading metadata file for import")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,15 +57,18 @@ func HandleImportFlag(flag string) error {
|
|||||||
}
|
}
|
||||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||||
dec.DisallowUnknownFields()
|
dec.DisallowUnknownFields()
|
||||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
job := schema.Job{
|
||||||
if err = dec.Decode(&jobMeta); err != nil {
|
Shared: "none",
|
||||||
log.Warn("Error while decoding raw json metadata for import")
|
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||||
|
}
|
||||||
|
if err = dec.Decode(&job); err != nil {
|
||||||
|
cclog.Warn("Error while decoding raw json metadata for import")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
raw, err = os.ReadFile(files[1])
|
raw, err = os.ReadFile(files[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while reading jobdata file for import")
|
cclog.Warn("Error while reading jobdata file for import")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,72 +81,41 @@ func HandleImportFlag(flag string) error {
|
|||||||
dec.DisallowUnknownFields()
|
dec.DisallowUnknownFields()
|
||||||
jobData := schema.JobData{}
|
jobData := schema.JobData{}
|
||||||
if err = dec.Decode(&jobData); err != nil {
|
if err = dec.Decode(&jobData); err != nil {
|
||||||
log.Warn("Error while decoding raw json jobdata for import")
|
cclog.Warn("Error while decoding raw json jobdata for import")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// checkJobData(&jobData)
|
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||||
|
|
||||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
if err = enrichJobMetadata(&job); err != nil {
|
||||||
|
cclog.Errorf("Error enriching job metadata: %v", err)
|
||||||
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
|
return err
|
||||||
// if err != nil {
|
|
||||||
// log.Warn("Error while finding job in jobRepository")
|
|
||||||
// return err
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
job := schema.Job{
|
|
||||||
BaseJob: jobMeta.BaseJob,
|
|
||||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
|
||||||
StartTimeUnix: jobMeta.StartTime,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Other metrics...
|
if err = SanityChecks(&job); err != nil {
|
||||||
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
|
cclog.Warn("BaseJob SanityChecks failed")
|
||||||
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
|
return err
|
||||||
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
|
}
|
||||||
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
|
|
||||||
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
|
|
||||||
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
|
|
||||||
|
|
||||||
job.RawResources, err = json.Marshal(job.Resources)
|
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
|
||||||
|
cclog.Error("Error while importing job")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
id, err := r.InsertJobDirect(&job)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while marshaling job resources")
|
cclog.Warn("Error while job db insert")
|
||||||
return err
|
|
||||||
}
|
|
||||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("Error while marshaling job metadata")
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = SanityChecks(&job.BaseJob); err != nil {
|
|
||||||
log.Warn("BaseJob SanityChecks failed")
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
|
|
||||||
log.Error("Error while importing job")
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
id, err := r.InsertJob(&job)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("Error while job db insert")
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tag := range job.Tags {
|
for _, tag := range job.Tags {
|
||||||
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||||
log.Error("Error while adding or creating tag")
|
cclog.Error("Error while adding or creating tag on import")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
cclog.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
package importer_test
|
package importer_test
|
||||||
@@ -16,9 +16,12 @@ import (
|
|||||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// copyFile copies a file from source path to destination path.
|
||||||
|
// Used by tests to set up test fixtures.
|
||||||
func copyFile(s string, d string) error {
|
func copyFile(s string, d string) error {
|
||||||
r, err := os.Open(s)
|
r, err := os.Open(s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -34,59 +37,40 @@ func copyFile(s string, d string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// setup initializes a test environment for importer tests.
|
||||||
|
//
|
||||||
|
// Creates a temporary directory with:
|
||||||
|
// - A test job archive with cluster configuration
|
||||||
|
// - A SQLite database initialized with schema
|
||||||
|
// - Configuration files loaded
|
||||||
|
//
|
||||||
|
// Returns a JobRepository instance for test assertions.
|
||||||
func setup(t *testing.T) *repository.JobRepository {
|
func setup(t *testing.T) *repository.JobRepository {
|
||||||
const testconfig = `{
|
const testconfig = `{
|
||||||
|
"main": {
|
||||||
"addr": "0.0.0.0:8080",
|
"addr": "0.0.0.0:8080",
|
||||||
"validate": false,
|
"validate": false,
|
||||||
|
"api-allowed-ips": [
|
||||||
|
"*"
|
||||||
|
]},
|
||||||
"archive": {
|
"archive": {
|
||||||
"kind": "file",
|
"kind": "file",
|
||||||
"path": "./var/job-archive"
|
"path": "./var/job-archive"
|
||||||
},
|
}
|
||||||
"jwts": {
|
}`
|
||||||
"max-age": "2m"
|
|
||||||
},
|
|
||||||
"clusters": [
|
|
||||||
{
|
|
||||||
"name": "testcluster",
|
|
||||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
|
||||||
"filterRanges": {
|
|
||||||
"numNodes": { "from": 1, "to": 64 },
|
|
||||||
"duration": { "from": 0, "to": 86400 },
|
|
||||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "fritz",
|
|
||||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
|
||||||
"filterRanges": {
|
|
||||||
"numNodes": { "from": 1, "to": 944 },
|
|
||||||
"duration": { "from": 0, "to": 86400 },
|
|
||||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "taurus",
|
|
||||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
|
||||||
"filterRanges": {
|
|
||||||
"numNodes": { "from": 1, "to": 4000 },
|
|
||||||
"duration": { "from": 0, "to": 604800 },
|
|
||||||
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]}`
|
|
||||||
|
|
||||||
log.Init("info", true)
|
cclog.Init("info", true)
|
||||||
tmpdir := t.TempDir()
|
tmpdir := t.TempDir()
|
||||||
|
|
||||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
|
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
|
||||||
if err := os.Mkdir(fritzArchive, 0777); err != nil {
|
if err := os.Mkdir(fritzArchive, 0o777); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if err := copyFile(filepath.Join("testdata", "cluster-fritz.json"),
|
if err := copyFile(filepath.Join("testdata", "cluster-fritz.json"),
|
||||||
@@ -95,27 +79,36 @@ func setup(t *testing.T) *repository.JobRepository {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
err := repository.MigrateDB(dbfilepath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
|
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
config.Init(cfgFilePath)
|
ccconf.Init(cfgFilePath)
|
||||||
|
|
||||||
|
// Load and check main configuration
|
||||||
|
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||||
|
config.Init(cfg)
|
||||||
|
} else {
|
||||||
|
t.Fatal("Main configuration must be present")
|
||||||
|
}
|
||||||
|
|
||||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||||
|
|
||||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
repository.Connect("sqlite3", dbfilepath)
|
repository.Connect(dbfilepath)
|
||||||
return repository.GetJobRepository()
|
return repository.GetJobRepository()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Result represents the expected test result for job import verification.
|
||||||
type Result struct {
|
type Result struct {
|
||||||
JobId int64
|
JobId int64
|
||||||
Cluster string
|
Cluster string
|
||||||
@@ -123,6 +116,8 @@ type Result struct {
|
|||||||
Duration int32
|
Duration int32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readResult reads the expected test result from a golden file.
|
||||||
|
// Golden files contain the expected job attributes after import.
|
||||||
func readResult(t *testing.T, testname string) Result {
|
func readResult(t *testing.T, testname string) Result {
|
||||||
var r Result
|
var r Result
|
||||||
|
|
||||||
@@ -140,6 +135,13 @@ func readResult(t *testing.T, testname string) Result {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestHandleImportFlag tests the HandleImportFlag function with various job import scenarios.
|
||||||
|
//
|
||||||
|
// The test uses golden files in testdata/ to verify that jobs are correctly:
|
||||||
|
// - Parsed from metadata and data JSON files
|
||||||
|
// - Enriched with footprints and energy metrics
|
||||||
|
// - Inserted into the database
|
||||||
|
// - Retrievable with correct attributes
|
||||||
func TestHandleImportFlag(t *testing.T) {
|
func TestHandleImportFlag(t *testing.T) {
|
||||||
r := setup(t)
|
r := setup(t)
|
||||||
|
|
||||||
|
|||||||
@@ -1,40 +1,68 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
// All rights reserved.
|
// All rights reserved. This file is part of cc-backend.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package importer provides functionality for importing job data into the ClusterCockpit database.
|
||||||
|
//
|
||||||
|
// The package supports two primary use cases:
|
||||||
|
// 1. Bulk database initialization from archived jobs via InitDB()
|
||||||
|
// 2. Individual job import from file pairs via HandleImportFlag()
|
||||||
|
//
|
||||||
|
// Both operations enrich job metadata by calculating footprints and energy metrics
|
||||||
|
// before persisting to the database.
|
||||||
package importer
|
package importer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Delete the tables "job", "tag" and "jobtag" from the database and
|
const (
|
||||||
// repopulate them using the jobs found in `archive`.
|
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
|
||||||
|
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InitDB reinitializes the job database from archived job data.
|
||||||
|
//
|
||||||
|
// This function performs the following operations:
|
||||||
|
// 1. Flushes existing job, tag, and jobtag tables
|
||||||
|
// 2. Iterates through all jobs in the archive
|
||||||
|
// 3. Enriches each job with calculated footprints and energy metrics
|
||||||
|
// 4. Inserts jobs and tags into the database in batched transactions
|
||||||
|
//
|
||||||
|
// Jobs are processed in batches of 100 for optimal performance. The function
|
||||||
|
// continues processing even if individual jobs fail, logging errors and
|
||||||
|
// returning a summary at the end.
|
||||||
|
//
|
||||||
|
// Returns an error if database initialization, transaction management, or
|
||||||
|
// critical operations fail. Individual job failures are logged but do not
|
||||||
|
// stop the overall import process.
|
||||||
func InitDB() error {
|
func InitDB() error {
|
||||||
r := repository.GetJobRepository()
|
r := repository.GetJobRepository()
|
||||||
if err := r.Flush(); err != nil {
|
if err := r.Flush(); err != nil {
|
||||||
log.Errorf("repository initDB(): %v", err)
|
cclog.Errorf("repository initDB(): %v", err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
starttime := time.Now()
|
starttime := time.Now()
|
||||||
log.Print("Building job table...")
|
cclog.Print("Building job table...")
|
||||||
|
|
||||||
t, err := r.TransactionInit()
|
t, err := r.TransactionInit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Error while initializing SQL transactions")
|
cclog.Warn("Error while initializing SQL transactions")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
tags := make(map[string]int64)
|
tags := make(map[string]int64)
|
||||||
|
|
||||||
// Not using log.Print because we want the line to end with `\r` and
|
// Not using cclog.Print because we want the line to end with `\r` and
|
||||||
// this function is only ever called when a special command line flag
|
// this function is only ever called when a special command line flag
|
||||||
// is passed anyways.
|
// is passed anyways.
|
||||||
fmt.Printf("%d jobs inserted...\r", 0)
|
fmt.Printf("%d jobs inserted...\r", 0)
|
||||||
@@ -46,92 +74,195 @@ func InitDB() error {
|
|||||||
for jobContainer := range ar.Iter(false) {
|
for jobContainer := range ar.Iter(false) {
|
||||||
|
|
||||||
jobMeta := jobContainer.Meta
|
jobMeta := jobContainer.Meta
|
||||||
|
if jobMeta == nil {
|
||||||
|
cclog.Warn("skipping job with nil metadata")
|
||||||
|
errorOccured++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// Bundle 100 inserts into one transaction for better performance
|
// Bundle 100 inserts into one transaction for better performance
|
||||||
if i%100 == 0 {
|
if i%100 == 0 {
|
||||||
r.TransactionCommit(t)
|
if i > 0 {
|
||||||
|
if err := t.Commit(); err != nil {
|
||||||
|
cclog.Errorf("transaction commit error: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Start a new transaction for the next batch
|
||||||
|
t, err = r.TransactionInit()
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("transaction init error: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Printf("%d jobs inserted...\r", i)
|
fmt.Printf("%d jobs inserted...\r", i)
|
||||||
}
|
}
|
||||||
|
|
||||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||||
job := schema.Job{
|
|
||||||
BaseJob: jobMeta.BaseJob,
|
|
||||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
|
||||||
StartTimeUnix: jobMeta.StartTime,
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: Other metrics...
|
if err := enrichJobMetadata(jobMeta); err != nil {
|
||||||
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
|
cclog.Errorf("repository initDB(): %v", err)
|
||||||
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
|
|
||||||
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
|
|
||||||
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
|
|
||||||
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
|
|
||||||
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
|
|
||||||
|
|
||||||
job.RawResources, err = json.Marshal(job.Resources)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("repository initDB(): %v", err)
|
|
||||||
errorOccured++
|
errorOccured++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
if err := SanityChecks(jobMeta); err != nil {
|
||||||
if err != nil {
|
cclog.Errorf("repository initDB(): %v", err)
|
||||||
log.Errorf("repository initDB(): %v", err)
|
|
||||||
errorOccured++
|
errorOccured++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := SanityChecks(&job.BaseJob); err != nil {
|
id, jobErr := r.TransactionAddNamed(t,
|
||||||
log.Errorf("repository initDB(): %v", err)
|
repository.NamedJobInsert, jobMeta)
|
||||||
|
if jobErr != nil {
|
||||||
|
cclog.Errorf("repository initDB(): %v", jobErr)
|
||||||
errorOccured++
|
errorOccured++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
id, err := r.TransactionAdd(t, job)
|
// Job successfully inserted, increment counter
|
||||||
if err != nil {
|
i += 1
|
||||||
log.Errorf("repository initDB(): %v", err)
|
|
||||||
errorOccured++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tag := range job.Tags {
|
for _, tag := range jobMeta.Tags {
|
||||||
tagstr := tag.Name + ":" + tag.Type
|
tagstr := tag.Name + ":" + tag.Type
|
||||||
tagId, ok := tags[tagstr]
|
tagID, ok := tags[tagstr]
|
||||||
if !ok {
|
if !ok {
|
||||||
tagId, err = r.TransactionAddTag(t, tag)
|
var err error
|
||||||
|
tagID, err = r.TransactionAdd(t,
|
||||||
|
addTagQuery,
|
||||||
|
tag.Name, tag.Type)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error adding tag: %v", err)
|
cclog.Errorf("Error adding tag: %v", err)
|
||||||
errorOccured++
|
errorOccured++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
tags[tagstr] = tagId
|
tags[tagstr] = tagID
|
||||||
}
|
}
|
||||||
|
|
||||||
r.TransactionSetTag(t, id, tagId)
|
r.TransactionAdd(t,
|
||||||
}
|
setTagQuery,
|
||||||
|
id, tagID)
|
||||||
if err == nil {
|
|
||||||
i += 1
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if errorOccured > 0 {
|
if errorOccured > 0 {
|
||||||
log.Warnf("Error in import of %d jobs!", errorOccured)
|
cclog.Warnf("Error in import of %d jobs!", errorOccured)
|
||||||
}
|
}
|
||||||
|
|
||||||
r.TransactionEnd(t)
|
r.TransactionEnd(t)
|
||||||
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
|
cclog.Infof("A total of %d jobs have been registered in %.3f seconds.", i, time.Since(starttime).Seconds())
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function also sets the subcluster if necessary!
|
// enrichJobMetadata calculates and populates job footprints, energy metrics, and serialized fields.
|
||||||
func SanityChecks(job *schema.BaseJob) error {
|
//
|
||||||
|
// This function performs the following enrichment operations:
|
||||||
|
// 1. Calculates job footprint metrics based on the subcluster configuration
|
||||||
|
// 2. Computes energy footprint and total energy consumption in kWh
|
||||||
|
// 3. Marshals footprints, resources, and metadata into JSON for database storage
|
||||||
|
//
|
||||||
|
// The function expects the job's MonitoringStatus and SubCluster to be already set.
|
||||||
|
// Energy calculations convert power metrics (Watts) to energy (kWh) using the formula:
|
||||||
|
//
|
||||||
|
// Energy (kWh) = (Power (W) * Duration (s) / 3600) / 1000
|
||||||
|
//
|
||||||
|
// Returns an error if subcluster retrieval, metric indexing, or JSON marshaling fails.
|
||||||
|
func enrichJobMetadata(job *schema.Job) error {
|
||||||
|
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
job.Footprint = make(map[string]float64)
|
||||||
|
|
||||||
|
for _, fp := range sc.Footprint {
|
||||||
|
statType := "avg"
|
||||||
|
|
||||||
|
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||||
|
statType = sc.MetricConfig[i].Footprint
|
||||||
|
}
|
||||||
|
|
||||||
|
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||||
|
|
||||||
|
job.Footprint[name] = repository.LoadJobStat(job, fp, statType)
|
||||||
|
}
|
||||||
|
|
||||||
|
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Warn("Error while marshaling job footprint")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
job.EnergyFootprint = make(map[string]float64)
|
||||||
|
|
||||||
|
// Total Job Energy Outside Loop
|
||||||
|
totalEnergy := 0.0
|
||||||
|
for _, fp := range sc.EnergyFootprint {
|
||||||
|
// Always Init Metric Energy Inside Loop
|
||||||
|
metricEnergy := 0.0
|
||||||
|
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||||
|
// Note: For DB data, calculate and save as kWh
|
||||||
|
switch sc.MetricConfig[i].Energy {
|
||||||
|
case "energy": // this metric has energy as unit (Joules)
|
||||||
|
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
|
||||||
|
// FIXME: Needs sum as stats type
|
||||||
|
case "power": // this metric has power as unit (Watt)
|
||||||
|
// Energy: Power (in Watts) * Time (in Seconds)
|
||||||
|
// Unit: (W * (s / 3600)) / 1000 = kWh
|
||||||
|
// Round 2 Digits: round(Energy * 100) / 100
|
||||||
|
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
|
||||||
|
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||||
|
rawEnergy := ((repository.LoadJobStat(job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
|
||||||
|
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
job.EnergyFootprint[fp] = metricEnergy
|
||||||
|
totalEnergy += metricEnergy
|
||||||
|
}
|
||||||
|
|
||||||
|
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
|
||||||
|
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||||
|
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
job.RawResources, err = json.Marshal(job.Resources)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Warn("Error while marshaling job resources")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Warn("Error while marshaling job metadata")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SanityChecks validates job metadata and ensures cluster/subcluster configuration is valid.
|
||||||
|
//
|
||||||
|
// This function performs the following validations:
|
||||||
|
// 1. Verifies the cluster exists in the archive configuration
|
||||||
|
// 2. Assigns and validates the subcluster (may modify job.SubCluster)
|
||||||
|
// 3. Validates job state is a recognized value
|
||||||
|
// 4. Ensures resources and user fields are populated
|
||||||
|
// 5. Validates node counts and hardware thread counts are positive
|
||||||
|
// 6. Verifies the number of resources matches the declared node count
|
||||||
|
//
|
||||||
|
// The function may modify the job's SubCluster field if it needs to be assigned.
|
||||||
|
//
|
||||||
|
// Returns an error if any validation check fails.
|
||||||
|
func SanityChecks(job *schema.Job) error {
|
||||||
if c := archive.GetCluster(job.Cluster); c == nil {
|
if c := archive.GetCluster(job.Cluster); c == nil {
|
||||||
return fmt.Errorf("no such cluster: %v", job.Cluster)
|
return fmt.Errorf("no such cluster: %v", job.Cluster)
|
||||||
}
|
}
|
||||||
if err := archive.AssignSubCluster(job); err != nil {
|
if err := archive.AssignSubCluster(job); err != nil {
|
||||||
log.Warn("Error while assigning subcluster to job")
|
cclog.Warn("Error while assigning subcluster to job")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if !job.State.Valid() {
|
if !job.State.Valid() {
|
||||||
@@ -150,18 +281,14 @@ func SanityChecks(job *schema.BaseJob) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
// checkJobData normalizes metric units in job data based on average values.
|
||||||
if stats, ok := job.Statistics[metric]; ok {
|
//
|
||||||
if metric == "mem_used" {
|
// NOTE: This function is currently unused and contains incomplete implementation.
|
||||||
return stats.Max
|
// It was intended to normalize byte and file-related metrics to appropriate SI prefixes,
|
||||||
} else {
|
// but the normalization logic is commented out. Consider removing or completing this
|
||||||
return stats.Avg
|
// function based on project requirements.
|
||||||
}
|
//
|
||||||
}
|
// TODO: Either implement the metric normalization or remove this dead code.
|
||||||
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
|
|
||||||
func checkJobData(d *schema.JobData) error {
|
func checkJobData(d *schema.JobData) error {
|
||||||
for _, scopes := range *d {
|
for _, scopes := range *d {
|
||||||
// var newUnit schema.Unit
|
// var newUnit schema.Unit
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user