mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-01-15 09:11:45 +01:00
Compare commits
987 Commits
v1.3.0
...
upstream-t
| Author | SHA1 | Date | |
|---|---|---|---|
| d12da655e9 | |||
| 50df63a2d2 | |||
| d23f20f42a | |||
| 965561956e | |||
| 5a65044caf | |||
| 1cd4a57bd3 | |||
| b35172e2f7 | |||
| 3cfcd30128 | |||
| e56532e5c8 | |||
| fdee4f8938 | |||
|
|
7acc89e42d | ||
|
|
af7d208c21 | ||
|
|
91b90d033e | ||
|
|
7a0975b94d | ||
|
|
c58b01a602 | ||
|
|
8244449646 | ||
|
|
436afa4a61 | ||
| 06ed056d43 | |||
| d446c13546 | |||
| 6e74fa294a | |||
|
|
43bdb56072 | ||
| e707fd0893 | |||
|
|
19c8e9beb1 | ||
|
|
32e5353847 | ||
|
|
d2f2d78954 | ||
| b8fdfc30c0 | |||
| 79a2ca8ae8 | |||
| d1a78c13a4 | |||
| f4b00e9de1 | |||
| 0a5e155096 | |||
| 4ecc050c4c | |||
| 88dc5036b3 | |||
| d30c6ef3bf | |||
|
0419fec810
|
|||
|
43e5fd1131
|
|||
|
|
11e94124cc | ||
|
|
102109388b | ||
|
|
60a69aa0a2 | ||
| 5e2cbd75fa | |||
| 14f1192ccb | |||
| 72b2560ecf | |||
| 7fce6fa401 | |||
| e6286768a7 | |||
| 0306723307 | |||
| 6f49998ad3 | |||
| 457c944ec6 | |||
| 33c38f9464 | |||
| 46351389b6 | |||
|
|
d56b0e93db | ||
| d567a5312e | |||
| 97a322354f | |||
| 554527445b | |||
|
|
c5aff1a2ca | ||
| 987cc40318 | |||
| 104fd1576a | |||
| 72ce3954b4 | |||
| cfa7461855 | |||
| 44cda8a232 | |||
| cf119e6843 | |||
|
|
451744f321 | ||
|
|
ca6682b94b | ||
|
|
cbad2341c3 | ||
|
|
a956c7b135 | ||
|
|
ea6caeb2f0 | ||
|
|
c17e8b1156 | ||
|
|
b993b1e096 | ||
| d7d81e352d | |||
| 078c608bda | |||
| f2e57f9edd | |||
| 5698d5216f | |||
| 10aa2bfbd3 | |||
| 6cfed989ff | |||
| ab70acd582 | |||
|
|
79e1c236fe | ||
|
|
fed62b6c45 | ||
|
|
0d62181272 | ||
|
|
290a71bd48 | ||
|
|
6e385db378 | ||
|
|
ffe8329b84 | ||
| f13be109c2 | |||
| d24d85b970 | |||
| 8d44ac90ad | |||
|
|
4083de2a51 | ||
|
|
131df075db | ||
|
|
afd6f50ba2 | ||
|
|
ad01366705 | ||
| 6325793902 | |||
|
|
8ea176f9da | ||
| 03b5272e44 | |||
| 7da01975f7 | |||
| 7cff8bbfd2 | |||
|
|
c98cbb33f8 | ||
| f3ea95535b | |||
| b9b84b7971 | |||
| be7340ca30 | |||
| 881c4566dd | |||
| 7efbb0217f | |||
| 9e2ce39cde | |||
|
|
0ff6cae1c3 | ||
|
|
d02ba3d717 | ||
| 6aa830adb6 | |||
| be6603cbb9 | |||
|
|
8d208929d5 | ||
|
|
cb0f96b737 | ||
|
|
83723ab050 | ||
|
|
3abaefa550 | ||
|
|
389010dbbd | ||
| 81fe2c043e | |||
| c76e9bb3fe | |||
| 48b68d3410 | |||
| 2b64b31393 | |||
| 2333068de7 | |||
| 78530029ef | |||
| 329b6e5640 | |||
|
|
967f0a3294 | ||
|
|
6eb779d359 | ||
|
|
414147177a | ||
|
|
3b37f3630c | ||
|
|
7c1a818582 | ||
|
|
c4cf7e9707 | ||
|
|
1ceb681521 | ||
|
|
443176a0d1 | ||
|
|
261905a364 | ||
| e00288b160 | |||
| f141ca926f | |||
| f7a0954213 | |||
|
|
da8d562eba | ||
|
|
399af8592c | ||
| 6239e7f19b | |||
| d0e1b7186c | |||
| fea3292f50 | |||
| 9973aa9ffa | |||
| 0b38a980d2 | |||
| 20838b6882 | |||
| 8f4ef1e274 | |||
| e1c7583670 | |||
| 39a2157d46 | |||
| dd63e7157a | |||
| 340efd7926 | |||
| ecc6194b57 | |||
|
|
90c3381954 | ||
|
|
21334c8026 | ||
|
|
cbdef6ce9e | ||
|
|
591cd9fd66 | ||
|
|
e8d2a45afb | ||
|
|
3b533938a6 | ||
|
|
9fe342a7e9 | ||
|
|
2152ced97a | ||
|
|
404be5f317 | ||
|
|
f56783a439 | ||
|
|
fb278182d3 | ||
|
|
c2c63d2f67 | ||
|
|
7f740455fe | ||
|
|
946b992746 | ||
|
|
a6c43e6f2f | ||
|
|
ecad52c18d | ||
|
|
e49e5a0474 | ||
|
|
9231b3cfca | ||
|
|
68e0159292 | ||
|
|
1a674590bf | ||
|
|
1ef47e7b3f | ||
|
|
214a2762df | ||
| cb5d06decd | |||
| 8555a88202 | |||
|
|
2287f4493a | ||
|
|
bb357f7cab | ||
|
|
d9b240cd2d | ||
|
|
bea5ee96d9 | ||
|
|
7d205fd526 | ||
|
|
c15b2a0cbb | ||
|
|
7ccba30a3d | ||
|
|
8091485588 | ||
|
|
1413f968d6 | ||
|
|
d1d1bb09e9 | ||
|
|
3c1a7e0171 | ||
|
|
0cb50f2f01 | ||
|
|
2287586700 | ||
|
|
ea7660ddb3 | ||
|
|
44e98e8f2f | ||
|
|
856ccbb969 | ||
|
|
0920286b4c | ||
|
|
f34e10cfd9 | ||
| ae5d202661 | |||
| bc43c844fc | |||
| 67be9aa27b | |||
| 047b997a22 | |||
| bac51891b7 | |||
|
|
714d6af7cd | ||
| 6efd6334bb | |||
| 91f4475d76 | |||
|
|
de309784b4 | ||
|
|
a623cf53f3 | ||
| 440cd59e50 | |||
| eefb6f6265 | |||
| f5e1226837 | |||
| 151f7e701f | |||
| 40398497c2 | |||
|
|
cda10788fb | ||
|
|
845905d9c8 | ||
| 89055506d6 | |||
|
|
5908ae7905 | ||
|
|
4131665284 | ||
|
|
6a43dfb0d7 | ||
| 3d38d78845 | |||
| 600f19ac80 | |||
|
|
0a3a664653 | ||
|
|
471ec1cd2e | ||
|
|
e296cd7ca0 | ||
|
|
31cfa8cd7c | ||
|
|
70fe8aa367 | ||
|
|
cc9dafac6f | ||
|
|
32429f1481 | ||
| 9485a463b8 | |||
| 35c6ab4a08 | |||
| e58b0fa015 | |||
| beb92967e5 | |||
| 015583f1cd | |||
| d40c54b802 | |||
| 647665b6b9 | |||
| 4fc78bc382 | |||
| 50d000e7e2 | |||
|
|
ad500c4bef | ||
|
|
916077c6f8 | ||
|
|
935fb238a4 | ||
|
|
d03e5b4562 | ||
|
|
05c45c6468 | ||
| 9020613a8b | |||
| be92d5943d | |||
|
|
b2368a0751 | ||
| 7948d5f773 | |||
|
|
1a16851ad0 | ||
|
|
810c14a839 | ||
|
|
df0e8eb228 | ||
| 79605c8a9e | |||
|
|
9b644119ae | ||
|
|
ffa9919019 | ||
|
55ca892f90
|
|||
|
eaca187032
|
|||
|
|
3b9d05cc6d | ||
| d00881de2e | |||
| d8e85cf75d | |||
| 39f21763e4 | |||
|
|
af43901ca3 | ||
|
|
62565b9ae2 | ||
|
|
bca176170c | ||
|
|
2a91ca0cff | ||
|
|
19a75554b0 | ||
|
|
58ae476a3e | ||
|
|
44d8254a0b | ||
|
|
bd2cdfcef2 | ||
| a50b832c2a | |||
|
|
10194105e3 | ||
|
|
b474288df7 | ||
|
|
f338209f32 | ||
|
|
bef832e45b | ||
|
|
71cfb4db77 | ||
| 86453e7e11 | |||
|
|
98b9f8e62d | ||
| 44cd8d258d | |||
| 764b65d094 | |||
|
|
4d2c64b012 | ||
|
|
35c0b0be58 | ||
|
|
7a54e2cfb3 | ||
|
|
54283f6d3c | ||
|
|
697acd1d88 | ||
|
|
5cdb80b4d6 | ||
|
|
e48ff8be73 | ||
|
|
096217eea6 | ||
|
|
ed5290be86 | ||
|
|
b036c3903c | ||
|
|
57b43b7b60 | ||
| ab1ddb7bd1 | |||
| 881f2f32f4 | |||
| 0754ba5292 | |||
|
|
743a89c3a2 | ||
|
|
6692c3ab7c | ||
|
|
c16a5fdac4 | ||
|
|
60ec7e54f5 | ||
| dd48f5ab87 | |||
|
|
db674ec31d | ||
|
|
48150ffc8b | ||
|
|
1ad80efab6 | ||
|
|
aa8789f8f8 | ||
|
|
56e3f2da5c | ||
|
|
a4104822e2 | ||
|
|
c13f386e3b | ||
| 4bd73450b5 | |||
| 64da28e814 | |||
| 639e1b9c6d | |||
|
|
63e828d2df | ||
|
|
b8c30b5703 | ||
|
|
805ea91fc2 | ||
|
|
c4c422da57 | ||
| 544fb35121 | |||
| 43edccb284 | |||
| 7531ba4b5c | |||
| 983aa592d8 | |||
| 8378784231 | |||
| dca25cc601 | |||
|
|
c8fe81cd80 | ||
| c0a4724f57 | |||
| 484c52d813 | |||
|
|
47843b2087 | ||
|
|
c3a6126799 | ||
|
|
e94b250541 | ||
|
|
db5f6c7540 | ||
|
|
79a6c9e90d | ||
| e2e67e3977 | |||
| 6c06450701 | |||
|
|
d7379a1af2 | ||
|
|
d731611e0c | ||
|
|
dceb92ba8e | ||
|
|
1e039cb1bf | ||
|
6f3e1ffbe3
|
|||
|
|
6a6dca3fce | ||
|
|
d6d92071bf | ||
|
|
d40657dc64 | ||
|
|
6dde2a1e59 | ||
|
|
b7823cec16 | ||
|
|
eabd7b8d51 | ||
|
|
27ec445e54 | ||
|
|
ad108b285f | ||
|
|
f471214ef7 | ||
|
|
a0190f8f40 | ||
| 82af984023 | |||
| 0373010497 | |||
|
|
c22d869aa7 | ||
| 87c93e90cd | |||
| 3d6dca9386 | |||
|
|
f946e7e6ab | ||
|
|
d50dfa5867 | ||
| 249128e011 | |||
| ca16a80b1f | |||
|
|
e789e7ba9b | ||
|
|
5048f7be14 | ||
|
|
0e3603f596 | ||
| 9cd4b3c1cc | |||
| 1d9aa75960 | |||
|
|
0a24ef70e0 | ||
| 3b5d3d671e | |||
| 7db83d216e | |||
| d1a7002422 | |||
| 1d8e7e072f | |||
|
7466fe7a34
|
|||
|
|
24cf5047da | ||
|
|
1f103e5ef5 | ||
|
|
9e87974eb1 | ||
|
|
d806cf76c4 | ||
|
|
6e2703998d | ||
| 6f9737c2c2 | |||
|
|
5e696c10d5 | ||
|
|
927e25c72c | ||
| 8b1b99ba35 | |||
| 2c102cd1ff | |||
|
|
42c4926c47 | ||
|
|
703556d893 | ||
|
|
0b529a5c3c | ||
|
|
5186b3f61e | ||
| 4dc0da5099 | |||
| 1bad6ba065 | |||
| 3efee22536 | |||
| eef48ac3a3 | |||
| e35cfbc3dd | |||
| 4a5fd96b32 | |||
|
|
bdffe73f59 | ||
| cdfe722457 | |||
| 0aecea6de2 | |||
| 5a88c77171 | |||
| 8003217092 | |||
| 9b325041c1 | |||
| 1e7fbe5d56 | |||
| 0261c263f9 | |||
| 8d6ae85b0d | |||
| f14bdb3068 | |||
| 3c66840f95 | |||
| 733e3ea9d5 | |||
|
ca634bb707
|
|||
| 9abc206d1a | |||
| 85f17c0fd8 | |||
| 14bad81b9f | |||
|
|
ffd596e2c7 | ||
| 99f8187092 | |||
| f30b784f45 | |||
| f06b5f8fc0 | |||
| 2e781b900d | |||
| d76b1ae75d | |||
| 40110580e0 | |||
| eab7961a83 | |||
| 432e06e801 | |||
| fe1ff5c7a3 | |||
| 6e66b8e08b | |||
| 7abdd0545e | |||
|
|
3f1768e467 | ||
|
|
f464921ae3 | ||
|
|
7603ad3fb0 | ||
|
|
be7ccc78b8 | ||
|
|
b3135c982f | ||
|
13386175f5
|
|||
|
23e8f3dc2d
|
|||
|
|
b323ce2eef | ||
|
|
08e323ba51 | ||
|
|
9f50f36b1d | ||
|
|
4399c1d590 | ||
|
|
f7376f6dca | ||
|
|
518cb34340 | ||
|
|
f210a5f508 | ||
|
|
9ebc49dd1c | ||
|
|
c119eeb468 | ||
|
|
ab616f8f79 | ||
|
|
69286881e4 | ||
|
|
4419df8d1b | ||
|
|
aed2bd48fc | ||
|
|
d3d752f90c | ||
|
|
33ecfe88ef | ||
|
|
fd52fdd35b | ||
|
|
1d13d3dccf | ||
|
|
1c84bcae35 | ||
|
|
df497d5952 | ||
|
|
f65e122f8d | ||
| 161f0744aa | |||
| 95de9ad3b3 | |||
|
|
d5c170055f | ||
|
|
61f0521072 | ||
|
|
6ca14c55f2 | ||
|
|
1309d09aee | ||
| aba75b3a19 | |||
|
|
e87481d8db | ||
| acaad69917 | |||
|
|
ff588ad57a | ||
| 65df27154c | |||
| 8dfa1957f4 | |||
| 570eba3794 | |||
| 94a39fc61f | |||
| 2d359e5f99 | |||
|
|
04692e0c44 | ||
|
|
809fd23b88 | ||
|
|
e3653daea3 | ||
|
|
48fa75386c | ||
|
|
1b3a12a4dc | ||
|
|
543ddf540e | ||
|
|
a3fb471546 | ||
|
|
277f964b30 | ||
|
|
9bcf7adb67 | ||
|
|
f343fa0071 | ||
|
|
e5862e9218 | ||
|
|
29ae2423f8 | ||
|
|
1755a4a7df | ||
|
|
25d3325049 | ||
|
|
fb6a4c3b87 | ||
| 317f80a984 | |||
| 28cdc1d9e5 | |||
| c2087b15d5 | |||
| a8d785beb3 | |||
|
|
a6784b5549 | ||
|
|
d770292be8 | ||
|
|
b3a1037ade | ||
|
|
02946cf0b4 | ||
|
|
cf051d5108 | ||
|
|
96977c6183 | ||
|
|
73d83164fc | ||
|
|
1064f5e4a8 | ||
|
|
5be98c7087 | ||
|
|
0d689c7dff | ||
|
|
1f24ed46a0 | ||
|
|
92b4159f9e | ||
|
|
5817b41e29 | ||
| d6b132e3a6 | |||
|
|
318f70f34c | ||
|
|
e41525d40a | ||
|
|
a102220e52 | ||
|
|
e9a214c5b2 | ||
|
|
c53f5eb144 | ||
|
|
9ed64e0388 | ||
|
|
93040d4629 | ||
|
|
0144ad43f5 | ||
|
|
8da2fc30c3 | ||
| 0e27ae7795 | |||
| 33c6cdb9fe | |||
|
|
73b7014469 | ||
| 25aaf55b93 | |||
| 6a7546c43b | |||
| 0adda4bf7b | |||
|
|
f5f36427a4 | ||
|
|
590bfd3a10 | ||
|
|
16db9bd1a2 | ||
|
|
d0af933b35 | ||
|
|
2b56b40e6d | ||
|
|
4b2d7068b3 | ||
|
|
bd93b8be8e | ||
|
|
aa3fe2b872 | ||
|
|
a61ff915ac | ||
|
|
0a3e678329 | ||
|
|
d4336b0dcb | ||
|
|
65d2698af4 | ||
|
|
6454576417 | ||
|
|
a485bd5977 | ||
|
|
e733688fd0 | ||
|
|
e86f6a8cbd | ||
|
|
fcc9e17664 | ||
|
|
5c9d4ffa9a | ||
|
|
419bc2747b | ||
|
|
1ee99d6866 | ||
|
|
3ab8973895 | ||
|
|
acfa3baeb5 | ||
|
|
c21d7cf101 | ||
|
|
ec895e1d9e | ||
|
|
c964f09a4f | ||
|
|
0bc32f27df | ||
|
|
6640e93ce9 | ||
|
|
d7aefe0cf0 | ||
|
|
187fe5b361 | ||
|
|
b31aea7bc5 | ||
|
c661baf058
|
|||
|
|
0fe0461340 | ||
|
|
d5394c9e92 | ||
|
|
42135fd26c | ||
|
|
38569f55c7 | ||
|
|
5ce03c2db3 | ||
|
|
1031b3eb79 | ||
|
|
fcdf4cd476 | ||
| 6268dffff8 | |||
| c10737bfd7 | |||
|
|
bd0cc69668 | ||
|
|
84fffac264 | ||
|
|
5bf968010e | ||
|
|
61bc095d01 | ||
|
|
e376f97547 | ||
|
|
f2428d3cb3 | ||
|
|
2fdac85d31 | ||
|
|
b731395689 | ||
|
|
07405e3466 | ||
|
|
fc0c76bd77 | ||
|
|
d209547968 | ||
| 632b9fc5ea | |||
| 702591b4ec | |||
|
|
c562746e5f | ||
|
|
c0443cbec2 | ||
|
|
0191bc3821 | ||
|
|
633bd42036 | ||
|
|
998ef8d834 | ||
|
|
c25b076ca9 | ||
|
|
f43379f365 | ||
|
|
d902c0acf4 | ||
|
|
58e678d72c | ||
|
|
cbc49669d0 | ||
|
|
78bb638fd6 | ||
|
|
7a61bae471 | ||
|
|
e1b992526e | ||
|
|
1b043838ea | ||
|
|
07e72294dc | ||
|
|
b6b37ee68b | ||
|
|
43cb1f1bff | ||
|
|
f7a67c72bf | ||
|
|
c5476d08fa | ||
|
|
8af92b1557 | ||
|
|
eaa826bb8a | ||
|
|
140b3c371d | ||
|
|
f158eaa29c | ||
|
|
c4b98ade53 | ||
|
|
f2e85306ca | ||
|
|
42b9de8360 | ||
|
|
6c244f3121 | ||
|
|
9f56213d2f | ||
|
|
fb2f7cf680 | ||
|
|
8fcdd24f84 | ||
|
|
aaafde4a7c | ||
|
|
2b23003556 | ||
|
|
5681062f01 | ||
|
|
d61bf212f5 | ||
|
|
2bd7c8d51e | ||
|
|
1e63cdbcda | ||
|
|
86d85f12be | ||
|
|
dd470d49ec | ||
|
|
95d8062b00 | ||
|
|
8f82399214 | ||
|
|
6247150e9c | ||
|
5266644725
|
|||
|
81d9e96552
|
|||
|
|
4ec9f06114 | ||
|
0033e9f6c0
|
|||
|
571652c314
|
|||
|
|
7ec233e18a | ||
|
|
13c9a12336 | ||
|
|
83d472ecd6 | ||
|
|
c21da6512a | ||
|
|
4b4374e0df | ||
|
|
407276a04d | ||
|
|
64f60905b4 | ||
|
|
9e6072fed2 | ||
|
|
a3e5c424fd | ||
|
|
6683a350aa | ||
|
|
05bfa9b546 | ||
|
|
735988decb | ||
|
|
d0580592be | ||
|
|
817076bdbf | ||
|
|
736236e9ca | ||
|
|
3f4114c51b | ||
|
|
5c2c493c56 | ||
|
|
2c383ebea1 | ||
|
|
91e73450cf | ||
|
|
e55798944e | ||
|
|
5ea11a5ad2 | ||
|
|
2a3383e9e6 | ||
|
|
e871703724 | ||
|
|
1ee367d7be | ||
|
|
bce536b9b4 | ||
|
|
7c9182e0b0 | ||
|
|
aa915d639d | ||
|
|
9489ebc7d6 | ||
|
2a5c525193
|
|||
|
9e2d981c60
|
|||
|
|
53dfe9e4f5 | ||
|
48e95fbdb0
|
|||
|
fd94d85edf
|
|||
|
f2d1a85afb
|
|||
|
0bdbcb8bab
|
|||
|
|
7b91a819be | ||
| bc89025924 | |||
|
|
16bcaef4c3 | ||
|
|
fcbfa451f2 | ||
|
|
559ce53ca4 | ||
|
|
ee2c5b58d7 | ||
|
|
d98d998106 | ||
|
212c45e070
|
|||
|
143fa9b6ed
|
|||
|
4849928288
|
|||
|
|
9248ee8868 | ||
|
|
1616d96732 | ||
| 0bbedd1600 | |||
|
|
c7e49644d8 | ||
|
010c903c74
|
|||
|
e4d12e3537
|
|||
|
051cc8384e
|
|||
|
49a94170d2
|
|||
|
|
42e8e37bd4 | ||
|
|
5d2c350ce2 | ||
|
|
85dc0362c1 | ||
|
|
01c06728eb | ||
|
|
257250714d | ||
|
|
3b769c3059 | ||
|
|
a7395ed45b | ||
|
|
ab07c7928f | ||
|
|
b0c0d15505 | ||
|
|
fcf50790da | ||
|
|
1e43654607 | ||
|
|
4fecbe820d | ||
|
|
763c9dfa6b | ||
|
9de5879786
|
|||
|
|
9396e7492c | ||
|
3ac3415178
|
|||
|
1aae1c59d0
|
|||
|
907e80a01c
|
|||
|
|
8a10b69716 | ||
|
|
1a3cf7edd6 | ||
|
|
76d0fc979b | ||
|
|
a42d8ece35 | ||
|
|
93377f53fc | ||
|
|
c853d74ba0 | ||
|
|
0b9f74f4f4 | ||
|
|
5da6baf828 | ||
|
5766945006
|
|||
|
a53d473b58
|
|||
|
|
d1207ad80e | ||
|
|
e2efe71b33 | ||
|
|
2aef6ed9c0 | ||
|
|
fcb6db0603 | ||
| 01b1136316 | |||
|
|
2512fe9e75 | ||
|
|
f89b5cd2ec | ||
|
|
ab284ed208 | ||
|
|
00a578657c | ||
|
|
38ce40ae7d | ||
| e1be6c7138 | |||
| 28539e60b0 | |||
|
adb11b3ed0
|
|||
|
|
f1e6dedd44 | ||
|
|
8ea1454c06 | ||
| 81b8d578f2 | |||
|
|
16b11db39c | ||
| 0d923cc920 | |||
| c523e93564 | |||
| d588798ea1 | |||
| a11f165f2a | |||
|
|
d4f487d554 | ||
|
|
93d5a0e532 | ||
|
|
00ddc462d2 | ||
|
|
5f4a74f8ba | ||
|
|
a8eff6fbd1 | ||
|
|
baa7367ebe | ||
|
|
69f8a34aac | ||
|
|
21b3a67988 | ||
|
|
d89574ce73 | ||
| ddeac6b9d9 | |||
| 17906ec0eb | |||
|
|
311c088d3d | ||
| a2584d6083 | |||
| 35bd7739c6 | |||
| 7f43c88a39 | |||
|
|
fc1c54a141 | ||
|
|
2af111c584 | ||
| c093cca8b1 | |||
|
|
2bb1b78ba4 | ||
| 3ab26172c4 | |||
| cdd45ce88b | |||
|
210a7d3136
|
|||
|
92ec64d80f
|
|||
|
ff37f71fdb
|
|||
|
6056341525
|
|||
|
|
075612f5bd | ||
| 1a87ed8210 | |||
|
|
c05ffeb16d | ||
| ee3710c5ed | |||
| 4327c4b1f7 | |||
| 492e56a098 | |||
| f0257a2784 | |||
| ec1ead89ab | |||
|
|
ae53e87aba | ||
|
|
939dd2320a | ||
|
|
2c8b73e2e2 | ||
|
|
eabc6212ea | ||
|
|
c120d6517f | ||
|
|
597ee1dad7 | ||
|
|
c4a901504d | ||
|
|
f5cc5d07fd | ||
|
|
8a0e6c921c | ||
|
|
bf1bff9ace | ||
|
|
06f24e988f | ||
|
|
ae327f545e | ||
|
|
35012b18c5 | ||
|
|
9688bad622 | ||
|
|
447b8d3372 | ||
|
|
01102cb9b0 | ||
|
|
934d1a6114 | ||
|
|
6f74c8cb77 | ||
|
|
63b9e619a4 | ||
|
|
82e28f26d7 | ||
|
|
ca9fd96baa | ||
|
|
39b22267d6 | ||
|
|
60d7984d66 | ||
|
|
33d219d2ac | ||
|
|
85a77e05af | ||
|
|
3dfeabcec6 | ||
|
|
673fdc443c | ||
|
|
2f6e5a7648 | ||
|
|
2cbe8e9517 | ||
|
|
2f0460d6ec | ||
|
|
37f4ed7770 | ||
|
|
e3104c61cb | ||
|
|
bc434ee8cb | ||
|
|
f4102b948e | ||
|
|
ed991de11a | ||
|
|
322e161064 | ||
|
|
1adc741cc2 | ||
|
|
4eff87bbf7 | ||
|
|
fc6970d08a | ||
|
|
f616c7e1c6 | ||
|
|
89ec749172 | ||
|
|
182f0f2c64 | ||
|
|
e3681495ce | ||
|
|
37415fa261 | ||
|
|
7243dbe763 | ||
|
|
0ff5c4bedd | ||
|
|
f047f89ad5 | ||
|
|
0eb0aa1d3b | ||
|
|
6019891591 | ||
|
|
615281601c | ||
|
|
82baf5d384 | ||
|
|
6fe93ecb7e | ||
|
|
b3222f3523 | ||
|
|
3b94863521 | ||
|
|
582dc8bf46 | ||
|
|
a9868fd275 | ||
|
|
218e56576a | ||
|
|
c50e79375a | ||
|
|
dcb8308f35 | ||
|
|
183b310696 | ||
|
|
c7d0c86d52 | ||
|
|
48225662b1 | ||
|
|
f53fc088ec | ||
|
|
05517fcbcd | ||
|
|
18af51b0a4 | ||
|
|
ede3da7a87 | ||
|
|
8e3327ef6a | ||
|
|
827f6daabc | ||
|
|
2567442321 | ||
|
|
9cf5478519 | ||
|
|
e5275311c2 | ||
|
|
21e4870e4c | ||
|
|
beba7c8d2e | ||
|
|
fe35313305 | ||
|
|
d7a8bbf40b | ||
|
|
f1893c596e | ||
|
|
6367c1ab4d | ||
|
|
9579887fc4 | ||
|
|
e29be2f140 | ||
|
|
2736b5d1ef | ||
|
|
ff52fb16b6 | ||
|
|
ccbf3867e1 | ||
|
|
f0de422c6e | ||
|
|
64cc19b252 | ||
|
|
26226009f0 | ||
|
|
d10e09da02 | ||
|
|
00a2e58fee | ||
|
|
b1cb45dfe6 | ||
|
|
a2951d1f05 | ||
|
|
c0b1e97602 | ||
|
|
71621a9dc4 | ||
|
|
b3ed2afebe | ||
|
|
704620baff | ||
|
|
8feb805167 | ||
|
|
065b32755a | ||
|
|
1b5f4bff2c | ||
|
|
8e1c5a485f | ||
| 5fa6c9db35 | |||
| 5482b9be2c | |||
|
|
7400273b0a | ||
|
|
0b7cdde4a0 | ||
|
|
d5382aec4f | ||
|
|
df484dc816 | ||
|
|
7ea4086807 | ||
|
|
b04bf6a951 | ||
| 7c33dcf630 | |||
| 5e65e21f0b | |||
| 53ca38ce53 | |||
|
|
398e3c1b91 | ||
| 508978d586 | |||
| e267481f71 | |||
|
|
193bee5ac8 | ||
| f58efa2871 | |||
| 6568b6d723 | |||
|
|
4b1b34d8a7 | ||
| 39c09f8565 | |||
|
|
275a77807e | ||
|
|
6443541a79 | ||
|
|
5eb6f7d307 | ||
|
|
bce2a66177 | ||
|
|
7602641909 | ||
|
|
54f3a261c5 | ||
|
|
906bac965f | ||
|
|
4ec1de6900 | ||
|
|
8ded131666 | ||
| 47b14f932e | |||
|
|
838ebb3f69 | ||
| c459724114 | |||
| b0c9d1164d | |||
| 7c51d88501 | |||
| 5b03cf826b | |||
| f305863616 | |||
| db5809d522 | |||
|
|
83df6f015c | ||
| e7231b0e13 | |||
|
|
cff60eb51c | ||
|
f914a312f5
|
|||
| 56ebb301ca | |||
|
|
a59df12595 | ||
|
|
5cc7fc6ccb | ||
|
|
55027cb630 | ||
|
|
036eba68e1 | ||
|
|
d34e0d9348 | ||
|
|
31765ce0ef | ||
|
|
9fe7cdca92 | ||
|
|
adc3502b6b | ||
|
|
95fe369648 | ||
|
|
01845a0cb7 | ||
|
|
708eaf4178 | ||
|
|
d629a58712 | ||
|
|
90886b63d6 | ||
|
|
084f89fa32 | ||
|
|
ceb3a095d8 | ||
|
|
1758275f11 | ||
|
|
e74e506ffe | ||
|
|
599a36466a | ||
|
|
613e128cab | ||
|
|
e4f8022b7a | ||
|
|
5603c41900 | ||
| a8a27c9b51 | |||
|
|
b70de5a4be | ||
|
|
b1fd07cd30 | ||
|
|
6ab2e02fe6 | ||
|
|
5535c5780c | ||
|
|
49e0a2c055 | ||
|
|
efbe53b6b4 | ||
|
5e074dad10
|
|||
|
d6a88896d0
|
|||
|
5c99f5f8bb
|
|||
|
e1faba0ff2
|
|||
|
ba2f406bc0
|
|||
|
9b6db4684a
|
|||
|
|
561fd41d5d | ||
|
|
ce9995dac7 | ||
|
|
0afaea9513 | ||
|
|
9b5c6e3164 | ||
|
|
e6ebec8c1e | ||
|
|
2551921ed6 | ||
|
|
e02575aad7 | ||
|
|
ff3502c87a | ||
|
|
017f9b2140 | ||
|
|
c80d3a6958 | ||
|
|
3ca1127685 | ||
|
|
18369da5bc | ||
|
|
e65100cdc8 | ||
|
|
6a1cb51c2f | ||
|
c4d93e492b
|
|||
|
c2f72f72ac
|
|||
|
721b6b2afa
|
|||
|
b6f011c669
|
|||
|
801607fc16
|
|||
|
01a4d33514
|
|||
|
e348ec74fd
|
|||
|
0458675608
|
|||
|
c61ffce0e9
|
|||
|
68a97dc980
|
|||
|
a07d167390
|
|||
|
|
a8721dcc69 | ||
|
|
68cf952ac6 | ||
|
|
e14d6a81fe | ||
|
|
a4912893a8 | ||
|
0adfb631ef
|
|||
|
b64ce1f67f
|
|||
|
e8e3b1595d
|
|||
|
f1427d5272
|
|||
|
|
bf6b87d65c | ||
|
|
0240997257 | ||
|
|
f1e341f0b9 | ||
|
a54acb8c42
|
|||
|
c6ede67589
|
|||
|
|
11176da5d8 | ||
|
|
0a604336c4 | ||
|
|
be9df7649f | ||
|
|
63fb923995 | ||
|
|
3afe40083d | ||
|
|
9d4767539c | ||
|
ac9bba8b5b
|
|||
|
80c46bea7f
|
|||
|
|
614f694777 | ||
|
|
1072d7b449 | ||
|
1b70596735
|
|||
|
|
61eebc9fbd | ||
|
b05909969f
|
|||
|
bd89ce7cc9
|
|||
|
130613b717
|
|||
|
b3c1f39a0e
|
|||
|
97c807cd33
|
|||
|
aede5f71ec
|
|||
|
786770f56a
|
|||
|
|
74d4f00784 | ||
|
d61c4235dc
|
|||
|
e8794b8c79
|
|||
|
552da005dc
|
|||
|
|
51452d2e68 | ||
|
5c5484b4d2
|
|||
|
|
9974a851e8 | ||
| 6c0bfc6c35 | |||
|
|
41bbd203cc | ||
|
|
4344c26bef | ||
|
|
e1c1c06fb2 | ||
|
|
70e63764ff | ||
|
|
d10f3e3af6 | ||
|
|
a4397d5447 | ||
|
|
320c87a1db | ||
|
|
8d1228c9e8 | ||
|
|
420bec7c46 | ||
|
|
ba1658beac | ||
|
|
575753038b | ||
|
|
061c9f0979 | ||
|
|
b48d1b8ad6 | ||
| dff7aeefb8 | |||
| 54f7980162 | |||
|
|
684cb5a376 | ||
|
|
597bccc080 | ||
|
|
72557fd0bf | ||
|
|
0b2f2214f9 | ||
|
|
ef51e69ffb | ||
|
|
c9eb40f455 | ||
|
|
b66750339d | ||
|
|
136460567c | ||
|
|
f80123c85d | ||
|
|
a22340196f | ||
|
|
cbaeffde2c | ||
| 649d50812b | |||
| b67f5436f8 | |||
| b637ddeb28 | |||
| 2502989ca2 | |||
| ba7cc9168e | |||
| dc0d9fe038 | |||
| 0e6c6937cd | |||
| d839c53642 |
15
.github/dependabot.yml
vendored
Normal file
15
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
# To get started with Dependabot version updates, you'll need to specify which
|
||||
# package ecosystems to update and where the package manifests are located.
|
||||
# Please see the documentation for all configuration options:
|
||||
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "gomod"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "npm"
|
||||
directory: "/web/frontend"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
331
.github/workflows/Release.yml
vendored
331
.github/workflows/Release.yml
vendored
@@ -1,331 +0,0 @@
|
||||
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
|
||||
|
||||
# Workflow name
|
||||
name: Release
|
||||
|
||||
# Run on tag push
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '**'
|
||||
|
||||
jobs:
|
||||
|
||||
#
|
||||
# Build on AlmaLinux 8.5 using golang-1.18.2
|
||||
#
|
||||
AlmaLinux-RPM-build:
|
||||
runs-on: ubuntu-latest
|
||||
# See: https://hub.docker.com/_/almalinux
|
||||
container: almalinux:8.5
|
||||
# The job outputs link to the outputs of the 'rpmrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
rpm : ${{steps.rpmrename.outputs.RPM}}
|
||||
srpm : ${{steps.rpmrename.outputs.SRPM}}
|
||||
steps:
|
||||
|
||||
# Use dnf to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
|
||||
dnf --assumeyes install wget openssl-devel diffutils delve which npm
|
||||
dnf --assumeyes install 'dnf-command(builddep)'
|
||||
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
||||
rpm -i go*.rpm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
||||
|
||||
- name: RPM build ClusterCockpit
|
||||
id: rpmbuild
|
||||
run: make RPM
|
||||
|
||||
# AlmaLinux 8.5 is a derivate of RedHat Enterprise Linux 8 (UBI8),
|
||||
# so the created RPM both contain the substring 'el8' in the RPM file names
|
||||
# This step replaces the substring 'el8' to 'alma85'. It uses the move operation
|
||||
# because it is unclear whether the default AlmaLinux 8.5 container contains the
|
||||
# 'rename' command. This way we also get the new names for output.
|
||||
- name: Rename RPMs (s/el8/alma85/)
|
||||
id: rpmrename
|
||||
run: |
|
||||
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
|
||||
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
|
||||
NEW_RPM="${OLD_RPM/el8/alma85}"
|
||||
NEW_SRPM=${OLD_SRPM/el8/alma85}
|
||||
mv "${OLD_RPM}" "${NEW_RPM}"
|
||||
mv "${OLD_SRPM}" "${NEW_SRPM}"
|
||||
echo "::set-output name=SRPM::${NEW_SRPM}"
|
||||
echo "::set-output name=RPM::${NEW_RPM}"
|
||||
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save RPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for AlmaLinux 8.5
|
||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||
- name: Save SRPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for AlmaLinux 8.5
|
||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||
|
||||
#
|
||||
# Build on UBI 8 using golang-1.18.2
|
||||
#
|
||||
UBI-8-RPM-build:
|
||||
runs-on: ubuntu-latest
|
||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
||||
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
|
||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||
outputs:
|
||||
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
||||
srpm : ${{steps.rpmbuild.outputs.SRPM}}
|
||||
steps:
|
||||
|
||||
# Use dnf to install development packages
|
||||
- name: Install development packages
|
||||
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
|
||||
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
||||
rpm -i go*.rpm
|
||||
dnf --assumeyes --disableplugin=subscription-manager install npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
||||
|
||||
- name: RPM build ClusterCockpit
|
||||
id: rpmbuild
|
||||
run: make RPM
|
||||
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save RPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for UBI 8
|
||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||
- name: Save SRPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for UBI 8
|
||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||
|
||||
#
|
||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
||||
#
|
||||
Ubuntu-focal-build:
|
||||
runs-on: ubuntu-latest
|
||||
container: ubuntu:20.04
|
||||
# The job outputs link to the outputs of the 'debrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
deb : ${{steps.debrename.outputs.DEB}}
|
||||
steps:
|
||||
# Use apt to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash
|
||||
apt --assume-yes install npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
- name: DEB build ClusterCockpit
|
||||
id: dpkg-build
|
||||
run: |
|
||||
ls -la
|
||||
pwd
|
||||
env
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
git config --global --add safe.directory $(pwd)
|
||||
make DEB
|
||||
- name: Rename DEB (add '_ubuntu20.04')
|
||||
id: debrename
|
||||
run: |
|
||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
|
||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save DEB as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 20.04
|
||||
path: ${{ steps.debrename.outputs.DEB }}
|
||||
|
||||
#
|
||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
||||
#
|
||||
Ubuntu-jammy-build:
|
||||
runs-on: ubuntu-latest
|
||||
container: ubuntu:22.04
|
||||
# The job outputs link to the outputs of the 'debrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
deb : ${{steps.debrename.outputs.DEB}}
|
||||
steps:
|
||||
# Use apt to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
- name: DEB build ClusterCockpit
|
||||
id: dpkg-build
|
||||
run: |
|
||||
ls -la
|
||||
pwd
|
||||
env
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
git config --global --add safe.directory $(pwd)
|
||||
make DEB
|
||||
- name: Rename DEB (add '_ubuntu22.04')
|
||||
id: debrename
|
||||
run: |
|
||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
|
||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save DEB as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 22.04
|
||||
path: ${{ steps.debrename.outputs.DEB }}
|
||||
|
||||
#
|
||||
# Create release with fresh RPMs
|
||||
#
|
||||
Release:
|
||||
runs-on: ubuntu-latest
|
||||
# We need the RPMs, so add dependency
|
||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
|
||||
|
||||
steps:
|
||||
# See: https://github.com/actions/download-artifact
|
||||
- name: Download AlmaLinux 8.5 RPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for AlmaLinux 8.5
|
||||
- name: Download AlmaLinux 8.5 SRPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for AlmaLinux 8.5
|
||||
|
||||
- name: Download UBI 8 RPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for UBI 8
|
||||
- name: Download UBI 8 SRPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for UBI 8
|
||||
|
||||
- name: Download Ubuntu 20.04 DEB
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 20.04
|
||||
|
||||
- name: Download Ubuntu 22.04 DEB
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 22.04
|
||||
|
||||
# The download actions do not publish the name of the downloaded file,
|
||||
# so we re-use the job outputs of the parent jobs. The files are all
|
||||
# downloaded to the current folder.
|
||||
# The gh-release action afterwards does not accept file lists but all
|
||||
# files have to be listed at 'files'. The step creates one output per
|
||||
# RPM package (2 per distro)
|
||||
- name: Set RPM variables
|
||||
id: files
|
||||
run: |
|
||||
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
|
||||
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
|
||||
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
|
||||
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
|
||||
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
|
||||
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
|
||||
echo "ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "U_2004_DEB::${U_2004_DEB}"
|
||||
echo "U_2204_DEB::${U_2204_DEB}"
|
||||
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
|
||||
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
|
||||
|
||||
# See: https://github.com/softprops/action-gh-release
|
||||
- name: Release
|
||||
uses: softprops/action-gh-release@v1
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
with:
|
||||
name: cc-backend-${{github.ref_name}}
|
||||
files: |
|
||||
${{ steps.files.outputs.ALMA_85_RPM }}
|
||||
${{ steps.files.outputs.ALMA_85_SRPM }}
|
||||
${{ steps.files.outputs.UBI_8_RPM }}
|
||||
${{ steps.files.outputs.UBI_8_SRPM }}
|
||||
${{ steps.files.outputs.U_2004_DEB }}
|
||||
${{ steps.files.outputs.U_2204_DEB }}
|
||||
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.x
|
||||
go-version: 1.25.x
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build, Vet & Test
|
||||
|
||||
29
.gitignore
vendored
29
.gitignore
vendored
@@ -1,19 +1,32 @@
|
||||
/cc-backend
|
||||
|
||||
/var/job-archive
|
||||
/var/*.db
|
||||
/var/machine-state
|
||||
|
||||
/.env
|
||||
/config.json
|
||||
/uiConfig.json
|
||||
|
||||
/var/job-archive
|
||||
/var/machine-state
|
||||
/var/*.db-shm
|
||||
/var/*.db-wal
|
||||
/var/*.db
|
||||
/var/*.txt
|
||||
|
||||
/var/checkpoints*
|
||||
|
||||
migrateTimestamps.pl
|
||||
test_ccms_write_api*
|
||||
|
||||
/web/frontend/public/build
|
||||
/web/frontend/node_modules
|
||||
/.vscode/*
|
||||
|
||||
/archive-migration
|
||||
/archive-manager
|
||||
var/job.db-shm
|
||||
var/job.db-wal
|
||||
|
||||
/internal/repository/testdata/job.db-shm
|
||||
/internal/repository/testdata/job.db-wal
|
||||
|
||||
/.vscode/*
|
||||
dist/
|
||||
*.db
|
||||
.idea
|
||||
tools/archive-migration/archive-migration
|
||||
tools/archive-manager/archive-manager
|
||||
|
||||
@@ -34,19 +34,6 @@ builds:
|
||||
main: ./tools/archive-manager
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
goarch:
|
||||
- amd64
|
||||
goamd64:
|
||||
- v3
|
||||
id: "archive-migration"
|
||||
binary: archive-migration
|
||||
main: ./tools/archive-migration
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
@@ -70,7 +57,7 @@ archives:
|
||||
{{- else }}{{ .Arch }}{{ end }}
|
||||
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
||||
checksum:
|
||||
name_template: 'checksums.txt'
|
||||
name_template: "checksums.txt"
|
||||
snapshot:
|
||||
name_template: "{{ incpatch .Version }}-next"
|
||||
changelog:
|
||||
@@ -100,7 +87,7 @@ changelog:
|
||||
release:
|
||||
draft: false
|
||||
footer: |
|
||||
Supports job archive version 1 and database version 6.
|
||||
Supports job archive version 2 and database version 8.
|
||||
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
||||
|
||||
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
||||
|
||||
26
AGENTS.md
Normal file
26
AGENTS.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# ClusterCockpit Backend - Agent Guidelines
|
||||
|
||||
## Build/Test Commands
|
||||
|
||||
- Build: `make` or `go build ./cmd/cc-backend`
|
||||
- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`)
|
||||
- Run single test: `go test -run TestName ./path/to/package`
|
||||
- Run single test file: `go test ./path/to/package -run TestName`
|
||||
- Frontend build: `cd web/frontend && npm install && npm run build`
|
||||
- Generate GraphQL: `make graphql` (uses gqlgen)
|
||||
- Generate Swagger: `make swagger` (uses swaggo/swag)
|
||||
|
||||
## Code Style
|
||||
|
||||
- **Formatting**: Use `gofumpt` for all Go files (strict requirement)
|
||||
- **Copyright header**: All files must include copyright header (see existing files)
|
||||
- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration
|
||||
- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines)
|
||||
- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`)
|
||||
- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package
|
||||
- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`)
|
||||
- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName`
|
||||
- **Comments**: Document all exported functions/types with godoc-style comments
|
||||
- **Structs**: Document fields with inline comments, especially for complex configurations
|
||||
- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses
|
||||
- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding
|
||||
215
CLAUDE.md
Normal file
215
CLAUDE.md
Normal file
@@ -0,0 +1,215 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with
|
||||
code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
ClusterCockpit is a job-specific performance monitoring framework for HPC
|
||||
clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a
|
||||
Svelte-based frontend, and manages job archives and metric data from various
|
||||
time-series databases.
|
||||
|
||||
## Build and Development Commands
|
||||
|
||||
### Building
|
||||
|
||||
```bash
|
||||
# Build everything (frontend + backend)
|
||||
make
|
||||
|
||||
# Build only the frontend
|
||||
make frontend
|
||||
|
||||
# Build only the backend (requires frontend to be built first)
|
||||
go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.4.4 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
# Run all tests
|
||||
make test
|
||||
|
||||
# Run tests with verbose output
|
||||
go test -v ./...
|
||||
|
||||
# Run tests for a specific package
|
||||
go test ./internal/repository
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
```bash
|
||||
# Regenerate GraphQL schema and resolvers (after modifying api/*.graphqls)
|
||||
make graphql
|
||||
|
||||
# Regenerate Swagger/OpenAPI docs (after modifying API comments)
|
||||
make swagger
|
||||
```
|
||||
|
||||
### Frontend Development
|
||||
|
||||
```bash
|
||||
cd web/frontend
|
||||
|
||||
# Install dependencies
|
||||
npm install
|
||||
|
||||
# Build for production
|
||||
npm run build
|
||||
|
||||
# Development mode with watch
|
||||
npm run dev
|
||||
```
|
||||
|
||||
### Running
|
||||
|
||||
```bash
|
||||
# Initialize database and create admin user
|
||||
./cc-backend -init-db -add-user demo:admin:demo
|
||||
|
||||
# Start server in development mode (enables GraphQL Playground and Swagger UI)
|
||||
./cc-backend -server -dev -loglevel info
|
||||
|
||||
# Start demo with sample data
|
||||
./startDemo.sh
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
### Backend Structure
|
||||
|
||||
The backend follows a layered architecture with clear separation of concerns:
|
||||
|
||||
- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems
|
||||
- **internal/repository**: Data access layer using repository pattern
|
||||
- Abstracts database operations (SQLite3 only)
|
||||
- Implements LRU caching for performance
|
||||
- Provides repositories for Job, User, Node, and Tag entities
|
||||
- Transaction support for batch operations
|
||||
- **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
|
||||
- **internal/graph**: GraphQL API (uses gqlgen)
|
||||
- Schema in `api/*.graphqls`
|
||||
- Generated code in `internal/graph/generated/`
|
||||
- Resolvers in `internal/graph/schema.resolvers.go`
|
||||
- **internal/auth**: Authentication layer
|
||||
- Supports local accounts, LDAP, OIDC, and JWT tokens
|
||||
- Implements rate limiting for login attempts
|
||||
- **internal/metricdata**: Metric data repository abstraction
|
||||
- Pluggable backends: cc-metric-store, Prometheus, InfluxDB
|
||||
- Each cluster can have a different metric data backend
|
||||
- **internal/archiver**: Job archiving to file-based archive
|
||||
- **pkg/archive**: Job archive backend implementations
|
||||
- File system backend (default)
|
||||
- S3 backend
|
||||
- SQLite backend (experimental)
|
||||
- **pkg/nats**: NATS integration for metric ingestion
|
||||
|
||||
### Frontend Structure
|
||||
|
||||
- **web/frontend**: Svelte 5 application
|
||||
- Uses Rollup for building
|
||||
- Components organized by feature (analysis, job, user, etc.)
|
||||
- GraphQL client using @urql/svelte
|
||||
- Bootstrap 5 + SvelteStrap for UI
|
||||
- uPlot for time-series visualization
|
||||
- **web/templates**: Server-side Go templates
|
||||
|
||||
### Key Concepts
|
||||
|
||||
**Job Archive**: Completed jobs are stored in a file-based archive following the
|
||||
[ClusterCockpit job-archive
|
||||
specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
|
||||
Each job has a `meta.json` file with metadata and metric data files.
|
||||
|
||||
**Metric Data Repositories**: Time-series metric data is stored separately from
|
||||
job metadata. The system supports multiple backends (cc-metric-store is
|
||||
recommended). Configuration is per-cluster in `config.json`.
|
||||
|
||||
**Authentication Flow**:
|
||||
|
||||
1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT)
|
||||
2. Each authenticator's `CanLogin` method is called to determine if it should handle the request
|
||||
3. The first authenticator that returns true performs the actual `Login`
|
||||
4. JWT tokens are used for API authentication
|
||||
|
||||
**Database Migrations**: SQL migrations in `internal/repository/migrations/` are
|
||||
applied automatically on startup. Version tracking in `version` table.
|
||||
|
||||
**Scopes**: Metrics can be collected at different scopes:
|
||||
|
||||
- Node scope (always available)
|
||||
- Core scope (for jobs with ≤8 nodes)
|
||||
- Accelerator scope (for GPU/accelerator metrics)
|
||||
|
||||
## Configuration
|
||||
|
||||
- **config.json**: Main configuration (clusters, metric repositories, archive settings)
|
||||
- **.env**: Environment variables (secrets like JWT keys)
|
||||
- Copy from `configs/env-template.txt`
|
||||
- NEVER commit this file
|
||||
- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config)
|
||||
|
||||
## Database
|
||||
|
||||
- Default: SQLite 3 (`./var/job.db`)
|
||||
- Connection managed by `internal/repository`
|
||||
- Schema version in `internal/repository/migration.go`
|
||||
|
||||
## Code Generation
|
||||
|
||||
**GraphQL** (gqlgen):
|
||||
|
||||
- Schema: `api/*.graphqls`
|
||||
- Config: `gqlgen.yml`
|
||||
- Generated code: `internal/graph/generated/`
|
||||
- Custom resolvers: `internal/graph/schema.resolvers.go`
|
||||
- Run `make graphql` after schema changes
|
||||
|
||||
**Swagger/OpenAPI**:
|
||||
|
||||
- Annotations in `internal/api/*.go`
|
||||
- Generated docs: `api/docs.go`, `api/swagger.yaml`
|
||||
- Run `make swagger` after API changes
|
||||
|
||||
## Testing Conventions
|
||||
|
||||
- Test files use `_test.go` suffix
|
||||
- Test data in `testdata/` subdirectories
|
||||
- Repository tests use in-memory SQLite
|
||||
- API tests use httptest
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Adding a new GraphQL field
|
||||
|
||||
1. Edit schema in `api/*.graphqls`
|
||||
2. Run `make graphql`
|
||||
3. Implement resolver in `internal/graph/schema.resolvers.go`
|
||||
|
||||
### Adding a new REST endpoint
|
||||
|
||||
1. Add handler in `internal/api/*.go`
|
||||
2. Add route in `internal/api/rest.go`
|
||||
3. Add Swagger annotations
|
||||
4. Run `make swagger`
|
||||
|
||||
### Adding a new metric data backend
|
||||
|
||||
1. Implement `MetricDataRepository` interface in `internal/metricdata/`
|
||||
2. Register in `metricdata.Init()` switch statement
|
||||
3. Update config.json schema documentation
|
||||
|
||||
### Modifying database schema
|
||||
|
||||
1. Create new migration in `internal/repository/migrations/`
|
||||
2. Increment `repository.Version`
|
||||
3. Test with fresh database and existing database
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Go 1.24.0+ (check go.mod for exact version)
|
||||
- Node.js (for frontend builds)
|
||||
- SQLite 3 (only supported database)
|
||||
- Optional: NATS server for metric ingestion
|
||||
49
Makefile
49
Makefile
@@ -1,8 +1,6 @@
|
||||
TARGET = ./cc-backend
|
||||
VAR = ./var
|
||||
CFG = config.json .env
|
||||
FRONTEND = ./web/frontend
|
||||
VERSION = 1.3.0
|
||||
VERSION = 1.4.4
|
||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||
@@ -22,17 +20,27 @@ SVELTE_COMPONENTS = status \
|
||||
header
|
||||
|
||||
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/header/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/job/*.svelte)
|
||||
|
||||
.PHONY: clean distclean test tags frontend $(TARGET)
|
||||
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
$(TARGET): $(VAR) $(CFG) $(SVELTE_TARGETS)
|
||||
$(TARGET): $(SVELTE_TARGETS)
|
||||
$(info ===> BUILD cc-backend)
|
||||
@go build -ldflags=${LD_FLAGS} ./cmd/cc-backend
|
||||
|
||||
@@ -40,6 +48,15 @@ frontend:
|
||||
$(info ===> BUILD frontend)
|
||||
cd web/frontend && npm install && npm run build
|
||||
|
||||
swagger:
|
||||
$(info ===> GENERATE swagger)
|
||||
@go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
|
||||
@mv ./api/docs.go ./internal/api/docs.go
|
||||
|
||||
graphql:
|
||||
$(info ===> GENERATE graphql)
|
||||
@go tool github.com/99designs/gqlgen
|
||||
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@go clean
|
||||
@@ -49,7 +66,7 @@ distclean:
|
||||
@$(MAKE) clean
|
||||
$(info ===> DISTCLEAN)
|
||||
@rm -rf $(FRONTEND)/node_modules
|
||||
@rm -rf $(VAR)
|
||||
@rm -rf ./var
|
||||
|
||||
test:
|
||||
$(info ===> TESTING)
|
||||
@@ -63,15 +80,7 @@ tags:
|
||||
@ctags -R
|
||||
|
||||
$(VAR):
|
||||
@mkdir $(VAR)
|
||||
|
||||
config.json:
|
||||
$(info ===> Initialize config.json file)
|
||||
@cp configs/config.json config.json
|
||||
|
||||
.env:
|
||||
$(info ===> Initialize .env file)
|
||||
@cp configs/env-template.txt .env
|
||||
@mkdir -p $(VAR)
|
||||
|
||||
$(SVELTE_TARGETS): $(SVELTE_SRC)
|
||||
$(info ===> BUILD frontend)
|
||||
|
||||
104
README.md
104
README.md
@@ -1,5 +1,8 @@
|
||||
# NOTE
|
||||
|
||||
While we do our best to keep the master branch in a usable state, there is no guarantee the master branch works.
|
||||
Please do not use it for production!
|
||||
|
||||
Please have a look at the [Release
|
||||
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
|
||||
for breaking changes!
|
||||
@@ -26,12 +29,11 @@ is also served by the backend using [Svelte](https://svelte.dev/) components.
|
||||
Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
|
||||
[Bootstrap Icons](https://icons.getbootstrap.com/).
|
||||
|
||||
The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by
|
||||
default. Optionally it can use a MySQL/MariaDB database server. While there are
|
||||
metric data backends for the InfluxDB and Prometheus time series databases, the
|
||||
only tested and supported setup is to use cc-metric-store as the metric data
|
||||
backend. Documentation on how to integrate ClusterCockpit with other time series
|
||||
databases will be added in the future.
|
||||
The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
|
||||
While there are metric data backends for the InfluxDB and Prometheus time series
|
||||
databases, the only tested and supported setup is to use cc-metric-store as the
|
||||
metric data backend. Documentation on how to integrate ClusterCockpit with other
|
||||
time series databases will be added in the future.
|
||||
|
||||
Completed batch jobs are stored in a file-based job archive according to
|
||||
[this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
|
||||
@@ -65,11 +67,11 @@ cd ./cc-backend
|
||||
./startDemo.sh
|
||||
```
|
||||
|
||||
You can also try the demo using the lates release binary.
|
||||
You can also try the demo using the latest release binary.
|
||||
Create a folder and put the release binary `cc-backend` into this folder.
|
||||
Execute the following steps:
|
||||
|
||||
``` shell
|
||||
```shell
|
||||
./cc-backend -init
|
||||
vim config.json (Add a second cluster entry and name the clusters alex and fritz)
|
||||
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
|
||||
@@ -88,9 +90,11 @@ Analysis, Systems and Status views).
|
||||
There is a Makefile to automate the build of cc-backend. The Makefile supports
|
||||
the following targets:
|
||||
|
||||
* `make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequesite handling. Any change of frontend source files will result in a complete rebuild.
|
||||
* `make clean`: Clean go build cache and remove binary.
|
||||
* `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||
- `make`: Initialize `var` directory and build svelte frontend and backend
|
||||
binary. Note that there is no proper prerequisite handling. Any change of
|
||||
frontend source files will result in a complete rebuild.
|
||||
- `make clean`: Clean go build cache and remove binary.
|
||||
- `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||
|
||||
A common workflow for setting up cc-backend from scratch is:
|
||||
|
||||
@@ -126,43 +130,41 @@ ln -s <your-existing-job-archive> ./var/job-archive
|
||||
|
||||
## Project file structure
|
||||
|
||||
* [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
|
||||
contains the API schema files for the REST and GraphQL APIs. The REST API is
|
||||
documented in the OpenAPI 3.0 format in
|
||||
[./api/openapi.yaml](./api/openapi.yaml).
|
||||
* [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
|
||||
contains `main.go` for the main application.
|
||||
* [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
|
||||
contains documentation about configuration and command line options and required
|
||||
environment variables. A sample configuration file is provided.
|
||||
* [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
|
||||
contains more in-depth documentation.
|
||||
* [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
|
||||
contains an example of setting up systemd for production use.
|
||||
* [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
|
||||
contains library source code that is not intended for use by others.
|
||||
* [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
|
||||
contains Go packages that can be used by other projects.
|
||||
* [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
|
||||
Additional command line helper tools.
|
||||
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
||||
Commands for getting infos about and existing job archive.
|
||||
* [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
|
||||
Tool to migrate from previous to current job archive version.
|
||||
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
||||
Tool to convert external pubkey for use in `cc-backend`.
|
||||
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
||||
contains a small application to generate a compatible JWT keypair. You find
|
||||
documentation on how to use it
|
||||
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
|
||||
* [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
|
||||
Server-side templates and frontend-related files:
|
||||
* [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
|
||||
Svelte components and static assets for the frontend UI
|
||||
* [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
|
||||
Server-side Go templates
|
||||
* [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
|
||||
Configures the behaviour and generation of
|
||||
[gqlgen](https://github.com/99designs/gqlgen).
|
||||
* [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
|
||||
is a shell script that sets up demo data, and builds and starts `cc-backend`.
|
||||
- [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
|
||||
contains the API schema files for the REST and GraphQL APIs. The REST API is
|
||||
documented in the OpenAPI 3.0 format in
|
||||
[./api/openapi.yaml](./api/openapi.yaml).
|
||||
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
|
||||
contains `main.go` for the main application.
|
||||
- [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
|
||||
contains documentation about configuration and command line options and required
|
||||
environment variables. A sample configuration file is provided.
|
||||
- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
|
||||
contains more in-depth documentation.
|
||||
- [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
|
||||
contains an example of setting up systemd for production use.
|
||||
- [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
|
||||
contains library source code that is not intended for use by others.
|
||||
- [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
|
||||
contains Go packages that can be used by other projects.
|
||||
- [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
|
||||
Additional command line helper tools.
|
||||
- [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
||||
Commands for getting infos about and existing job archive.
|
||||
- [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
||||
Tool to convert external pubkey for use in `cc-backend`.
|
||||
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
||||
contains a small application to generate a compatible JWT keypair. You find
|
||||
documentation on how to use it
|
||||
[here](https://github.com/ClusterCockpit/cc-backend/blob/master/docs/JWT-Handling.md).
|
||||
- [`web/`](https://github.com/ClusterCockpit/cc-backend/tree/master/web)
|
||||
Server-side templates and frontend-related files:
|
||||
- [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
|
||||
Svelte components and static assets for the frontend UI
|
||||
- [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
|
||||
Server-side Go templates
|
||||
- [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
|
||||
Configures the behaviour and generation of
|
||||
[gqlgen](https://github.com/99designs/gqlgen).
|
||||
- [`startDemo.sh`](https://github.com/ClusterCockpit/cc-backend/blob/master/startDemo.sh)
|
||||
is a shell script that sets up demo data, and builds and starts `cc-backend`.
|
||||
|
||||
@@ -1,12 +1,47 @@
|
||||
# `cc-backend` version 1.3.0
|
||||
# `cc-backend` version 1.4.4
|
||||
|
||||
Supports job archive version 1 and database version 7.
|
||||
Supports job archive version 2 and database version 8.
|
||||
|
||||
This is a minor release of `cc-backend`, the API backend and frontend
|
||||
This is a bug fix release of `cc-backend`, the API backend and frontend
|
||||
implementation of ClusterCockpit.
|
||||
For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).
|
||||
|
||||
## Breaking changes
|
||||
|
||||
* This release fixes bugs in the MySQL/MariaDB database schema. For this reason
|
||||
you have to migrate your database using the `-migrate-db` switch.
|
||||
The option `apiAllowedIPs` is now a required configuration attribute in
|
||||
`config.json`. This option restricts access to the admin API.
|
||||
|
||||
To retain the previous behavior that the API is per default accessible from
|
||||
everywhere set:
|
||||
|
||||
```json
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]
|
||||
```
|
||||
|
||||
## Breaking changes for minor release 1.4.x
|
||||
|
||||
- You need to perform a database migration. Depending on your database size the
|
||||
migration might require several hours!
|
||||
- You need to adapt the `cluster.json` configuration files in the job-archive,
|
||||
add new required attributes to the metric list and after that edit
|
||||
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
|
||||
attribute set can be filtered and show up in the footprint UI and polar plot.
|
||||
- Continuous scrolling is default now in all job lists. You can change this back
|
||||
to paging globally, also every user can configure to use paging or continuous
|
||||
scrolling individually.
|
||||
- Tags have a scope now. Existing tags will get global scope in the database
|
||||
migration.
|
||||
|
||||
## New features
|
||||
|
||||
- Enable to delete tags from the web interface
|
||||
|
||||
## Known issues
|
||||
|
||||
- Currently energy footprint metrics of type energy are ignored for calculating
|
||||
total energy.
|
||||
- Resampling for running jobs only works with cc-metric-store
|
||||
- With energy footprint metrics of type power the unit is ignored and it is
|
||||
assumed the metric has the unit Watt.
|
||||
|
||||
@@ -4,138 +4,222 @@ scalar Any
|
||||
scalar NullableFloat
|
||||
scalar MetricScope
|
||||
scalar JobState
|
||||
scalar SchedulerState
|
||||
scalar MonitoringState
|
||||
|
||||
type Node {
|
||||
id: ID!
|
||||
hostname: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
jobsRunning: Int!
|
||||
cpusAllocated: Int
|
||||
memoryAllocated: Int
|
||||
gpusAllocated: Int
|
||||
schedulerState: SchedulerState!
|
||||
healthState: MonitoringState!
|
||||
metaData: Any
|
||||
}
|
||||
|
||||
type NodeStates {
|
||||
state: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type NodeStatesTimed {
|
||||
state: String!
|
||||
counts: [Int!]!
|
||||
times: [Int!]!
|
||||
}
|
||||
|
||||
type Job {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
SMT: Int!
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
energy: Float!
|
||||
SMT: Int!
|
||||
shared: String!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
monitoringStatus: Int!
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
|
||||
memUsedMax: Float
|
||||
flopsAnyAvg: Float
|
||||
memBwAvg: Float
|
||||
loadAvg: Float
|
||||
|
||||
metaData: Any
|
||||
userData: User
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
footprint: [FootprintValue]
|
||||
energyFootprint: [EnergyFootprintValue]
|
||||
metaData: Any
|
||||
userData: User
|
||||
}
|
||||
|
||||
type JobLink {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
}
|
||||
|
||||
type Cluster {
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
metricConfig: [MetricConfig!]!
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
}
|
||||
|
||||
type SubCluster {
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: MetricValue!
|
||||
flopRateSimd: MetricValue!
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: MetricValue!
|
||||
flopRateSimd: MetricValue!
|
||||
memoryBandwidth: MetricValue!
|
||||
topology: Topology!
|
||||
topology: Topology!
|
||||
metricConfig: [MetricConfig!]!
|
||||
footprint: [String!]!
|
||||
}
|
||||
|
||||
type FootprintValue {
|
||||
name: String!
|
||||
stat: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type EnergyFootprintValue {
|
||||
hardware: String!
|
||||
metric: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type MetricValue {
|
||||
name: String
|
||||
unit: Unit!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type Topology {
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
memoryDomain: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
id: String!
|
||||
type: String!
|
||||
id: String!
|
||||
type: String!
|
||||
model: String!
|
||||
}
|
||||
|
||||
type SubClusterConfig {
|
||||
name: String!
|
||||
peak: Float
|
||||
normal: Float
|
||||
name: String!
|
||||
peak: Float
|
||||
normal: Float
|
||||
caution: Float
|
||||
alert: Float
|
||||
remove: Boolean
|
||||
alert: Float
|
||||
remove: Boolean
|
||||
}
|
||||
|
||||
type MetricConfig {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
aggregation: String!
|
||||
timestep: Int!
|
||||
peak: Float!
|
||||
normal: Float
|
||||
timestep: Int!
|
||||
peak: Float!
|
||||
normal: Float
|
||||
caution: Float!
|
||||
alert: Float!
|
||||
alert: Float!
|
||||
lowerIsBetter: Boolean
|
||||
subClusters: [SubClusterConfig!]!
|
||||
}
|
||||
|
||||
type Tag {
|
||||
id: ID!
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
scope: String!
|
||||
}
|
||||
|
||||
type Resource {
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
configuration: String
|
||||
}
|
||||
|
||||
type JobMetricWithName {
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
metric: JobMetric!
|
||||
}
|
||||
|
||||
type ClusterMetricWithName {
|
||||
name: String!
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
statisticsSeries: StatsSeries
|
||||
}
|
||||
|
||||
type Series {
|
||||
hostname: String!
|
||||
id: String
|
||||
hostname: String!
|
||||
id: String
|
||||
statistics: MetricStatistics
|
||||
data: [NullableFloat!]!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
median: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type NamedStatsWithScope {
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
stats: [ScopedStats!]!
|
||||
}
|
||||
|
||||
type ScopedStats {
|
||||
hostname: String!
|
||||
id: String
|
||||
data: MetricStatistics!
|
||||
}
|
||||
|
||||
type JobStats {
|
||||
id: Int!
|
||||
jobId: String!
|
||||
startTime: Int!
|
||||
duration: Int!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int
|
||||
numAccelerators: Int
|
||||
stats: [NamedStats!]!
|
||||
}
|
||||
|
||||
type NamedStats {
|
||||
name: String!
|
||||
data: MetricStatistics!
|
||||
}
|
||||
|
||||
type Unit {
|
||||
@@ -149,20 +233,14 @@ type MetricStatistics {
|
||||
max: Float!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
metric: String!
|
||||
data: [NullableFloat!]!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type Footprints {
|
||||
timeWeights: TimeWeights!
|
||||
metrics: [MetricFootprints!]!
|
||||
metrics: [MetricFootprints!]!
|
||||
}
|
||||
|
||||
type TimeWeights {
|
||||
@@ -171,87 +249,221 @@ type TimeWeights {
|
||||
coreHours: [NullableFloat!]!
|
||||
}
|
||||
|
||||
enum Aggregate { USER, PROJECT, CLUSTER }
|
||||
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS }
|
||||
enum Aggregate {
|
||||
USER
|
||||
PROJECT
|
||||
CLUSTER
|
||||
SUBCLUSTER
|
||||
}
|
||||
enum SortByAggregate {
|
||||
TOTALWALLTIME
|
||||
TOTALJOBS
|
||||
TOTALUSERS
|
||||
TOTALNODES
|
||||
TOTALNODEHOURS
|
||||
TOTALCORES
|
||||
TOTALCOREHOURS
|
||||
TOTALACCS
|
||||
TOTALACCHOURS
|
||||
}
|
||||
|
||||
type NodeMetrics {
|
||||
host: String!
|
||||
host: String!
|
||||
state: String!
|
||||
subCluster: String!
|
||||
metrics: [JobMetricWithName!]!
|
||||
metrics: [JobMetricWithName!]!
|
||||
}
|
||||
|
||||
type ClusterMetrics {
|
||||
nodeCount: Int!
|
||||
metrics: [ClusterMetricWithName!]!
|
||||
}
|
||||
|
||||
type NodesResultList {
|
||||
items: [NodeMetrics!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
totalNodes: Int
|
||||
hasNextPage: Boolean
|
||||
}
|
||||
|
||||
type ClusterSupport {
|
||||
cluster: String!
|
||||
subClusters: [String!]!
|
||||
}
|
||||
|
||||
type GlobalMetricListItem {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
footprint: String
|
||||
availability: [ClusterSupport!]!
|
||||
}
|
||||
|
||||
type Count {
|
||||
name: String!
|
||||
name: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type User {
|
||||
username: String!
|
||||
name: String!
|
||||
email: String!
|
||||
name: String!
|
||||
email: String!
|
||||
}
|
||||
|
||||
input MetricStatItem {
|
||||
metricName: String!
|
||||
range: FloatRange!
|
||||
}
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
globalMetrics: [GlobalMetricListItem!]!
|
||||
|
||||
user(username: String!): User
|
||||
allocatedNodes(cluster: String!): [Count!]!
|
||||
|
||||
## Node Queries New
|
||||
node(id: ID!): Node
|
||||
nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
|
||||
nodeStates(filter: [NodeFilter!]): [NodeStates!]!
|
||||
nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]!
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
||||
jobMetrics(
|
||||
id: ID!
|
||||
metrics: [String!]
|
||||
scopes: [MetricScope!]
|
||||
resolution: Int
|
||||
): [JobMetricWithName!]!
|
||||
|
||||
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
|
||||
|
||||
scopedJobStats(
|
||||
id: ID!
|
||||
metrics: [String!]
|
||||
scopes: [MetricScope!]
|
||||
): [NamedStatsWithScope!]!
|
||||
|
||||
jobs(
|
||||
filter: [JobFilter!]
|
||||
page: PageRequest
|
||||
order: OrderByInput
|
||||
): JobResultList!
|
||||
|
||||
jobsStatistics(
|
||||
filter: [JobFilter!]
|
||||
metrics: [String!]
|
||||
page: PageRequest
|
||||
sortBy: SortByAggregate
|
||||
groupBy: Aggregate
|
||||
numDurationBins: String
|
||||
numMetricBins: Int
|
||||
): [JobsStatistics!]!
|
||||
|
||||
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
|
||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||
|
||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
||||
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
|
||||
rooflineHeatmap(
|
||||
filter: [JobFilter!]!
|
||||
rows: Int!
|
||||
cols: Int!
|
||||
minX: Float!
|
||||
minY: Float!
|
||||
maxX: Float!
|
||||
maxY: Float!
|
||||
): [[Float!]!]!
|
||||
|
||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
||||
nodeMetrics(
|
||||
cluster: String!
|
||||
nodes: [String!]
|
||||
scopes: [MetricScope!]
|
||||
metrics: [String!]
|
||||
from: Time!
|
||||
to: Time!
|
||||
): [NodeMetrics!]!
|
||||
|
||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
||||
nodeMetricsList(
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
stateFilter: String!
|
||||
nodeFilter: String!
|
||||
scopes: [MetricScope!]
|
||||
metrics: [String!]
|
||||
from: Time!
|
||||
to: Time!
|
||||
page: PageRequest
|
||||
resolution: Int
|
||||
): NodesResultList!
|
||||
|
||||
clusterMetrics(
|
||||
cluster: String!
|
||||
metrics: [String!]
|
||||
from: Time!
|
||||
to: Time!
|
||||
): ClusterMetrics!
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
createTag(type: String!, name: String!, scope: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagFromList(tagIds: [ID!]!): [Int!]!
|
||||
|
||||
updateConfiguration(name: String!, value: String!): String
|
||||
}
|
||||
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
type IntRangeOutput {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
type TimeRangeOutput {
|
||||
range: String
|
||||
from: Time!
|
||||
to: Time!
|
||||
}
|
||||
|
||||
input NodeFilter {
|
||||
hostname: StringInput
|
||||
cluster: StringInput
|
||||
subcluster: StringInput
|
||||
schedulerState: SchedulerState
|
||||
healthState: MonitoringState
|
||||
timeStart: Int
|
||||
}
|
||||
|
||||
input JobFilter {
|
||||
tags: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
jobName: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
tags: [ID!]
|
||||
dbId: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
jobName: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
energy: FloatRange
|
||||
|
||||
minRunningFor: Int
|
||||
|
||||
numNodes: IntRange
|
||||
numNodes: IntRange
|
||||
numAccelerators: IntRange
|
||||
numHWThreads: IntRange
|
||||
numHWThreads: IntRange
|
||||
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
memUsedMax: FloatRange
|
||||
|
||||
exclusive: Int
|
||||
node: StringInput
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
metricStats: [MetricStatItem!]
|
||||
shared: String
|
||||
node: StringInput
|
||||
}
|
||||
|
||||
input OrderByInput {
|
||||
field: String!
|
||||
type: String!
|
||||
order: SortDirectionEnum! = ASC
|
||||
}
|
||||
|
||||
@@ -261,30 +473,46 @@ enum SortDirectionEnum {
|
||||
}
|
||||
|
||||
input StringInput {
|
||||
eq: String
|
||||
neq: String
|
||||
contains: String
|
||||
eq: String
|
||||
neq: String
|
||||
contains: String
|
||||
startsWith: String
|
||||
endsWith: String
|
||||
in: [String!]
|
||||
endsWith: String
|
||||
in: [String!]
|
||||
}
|
||||
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
input IntRange {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
input TimeRange {
|
||||
range: String
|
||||
from: Time
|
||||
to: Time
|
||||
}
|
||||
|
||||
input FloatRange {
|
||||
from: Float!
|
||||
to: Float!
|
||||
}
|
||||
|
||||
type NodeStateResultList {
|
||||
items: [Node!]!
|
||||
count: Int
|
||||
}
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
items: [Job!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
hasNextPage: Boolean
|
||||
}
|
||||
|
||||
type JobLinkResultList {
|
||||
listQuery: String
|
||||
items: [JobLink!]!
|
||||
count: Int
|
||||
items: [JobLink!]!
|
||||
count: Int
|
||||
}
|
||||
|
||||
type HistoPoint {
|
||||
@@ -295,6 +523,7 @@ type HistoPoint {
|
||||
type MetricHistoPoints {
|
||||
metric: String!
|
||||
unit: String!
|
||||
stat: String
|
||||
data: [MetricHistoPoint!]
|
||||
}
|
||||
|
||||
@@ -305,27 +534,28 @@ type MetricHistoPoint {
|
||||
max: Int
|
||||
}
|
||||
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster
|
||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||
totalJobs: Int! # Number of jobs
|
||||
runningJobs: Int! # Number of running jobs
|
||||
shortJobs: Int! # Number of jobs with a duration of less than duration
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalNodes: Int! # Sum of the nodes of all matched jobs
|
||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||
totalCores: Int! # Sum of the cores of all matched jobs
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
totalAccs: Int! # Sum of the accs of all matched jobs
|
||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
||||
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
||||
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster
|
||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||
totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs)
|
||||
totalJobs: Int! # Number of jobs
|
||||
runningJobs: Int! # Number of running jobs
|
||||
shortJobs: Int! # Number of jobs with a duration of less than config'd ShortRunningJobsDuration
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalNodes: Int! # Sum of the nodes of all matched jobs
|
||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||
totalCores: Int! # Sum of the cores of all matched jobs
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
totalAccs: Int! # Sum of the accs of all matched jobs
|
||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
||||
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
||||
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
||||
}
|
||||
|
||||
input PageRequest {
|
||||
itemsPerPage: Int!
|
||||
page: Int!
|
||||
page: Int!
|
||||
}
|
||||
|
||||
844
api/swagger.json
844
api/swagger.json
File diff suppressed because it is too large
Load Diff
663
api/swagger.yaml
663
api/swagger.yaml
File diff suppressed because it is too large
Load Diff
38
cmd/cc-backend/cli.go
Normal file
38
cmd/cc-backend/cli.go
Normal file
@@ -0,0 +1,38 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||
// This file defines all command-line flags and their default values.
|
||||
package main
|
||||
|
||||
import "flag"
|
||||
|
||||
var (
|
||||
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
|
||||
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool
|
||||
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||
)
|
||||
|
||||
func cliInit() {
|
||||
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
|
||||
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
|
||||
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
|
||||
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
|
||||
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
|
||||
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
|
||||
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
|
||||
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
|
||||
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
|
||||
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
|
||||
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
|
||||
flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>")
|
||||
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
|
||||
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
|
||||
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`")
|
||||
flag.Parse()
|
||||
}
|
||||
119
cmd/cc-backend/init.go
Normal file
119
cmd/cc-backend/init.go
Normal file
@@ -0,0 +1,119 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||
// This file contains bootstrap logic for initializing the environment,
|
||||
// creating default configuration files, and setting up the database.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
const envString = `
|
||||
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||
# You can generate your own keypair using the gen-keypair tool
|
||||
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||
|
||||
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||
`
|
||||
|
||||
const configString = `
|
||||
{
|
||||
"main": {
|
||||
"addr": "127.0.0.1:8080",
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"minimumPoints": 600,
|
||||
"trigger": 180,
|
||||
"resolutions": [
|
||||
240,
|
||||
60
|
||||
]
|
||||
},
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
],
|
||||
"emission-constant": 317
|
||||
},
|
||||
"cron": {
|
||||
"commit-job-worker": "2m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "name",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2023-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
|
||||
func initEnv() {
|
||||
if util.CheckFileExists("var") {
|
||||
cclog.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
|
||||
}
|
||||
|
||||
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||
cclog.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||
cclog.Abortf("Could not write default ./.env file with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err := os.Mkdir("var", 0o777); err != nil {
|
||||
cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
err := repository.MigrateDB("./var/job.db")
|
||||
if err != nil {
|
||||
cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
if err := os.Mkdir("var/job-archive", 0o777); err != nil {
|
||||
cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
archiveCfg := "{\"kind\": \"file\",\"path\": \"./var/job-archive\"}"
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
390
cmd/cc-backend/server.go
Normal file
390
cmd/cc-backend/server.go
Normal file
@@ -0,0 +1,390 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
// Package main provides the entry point for the ClusterCockpit backend server.
|
||||
// This file contains HTTP server setup, routing configuration, and
|
||||
// authentication middleware integration.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/handler/transport"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/nats"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/mux"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
)
|
||||
|
||||
var buildInfo web.Build
|
||||
|
||||
// Environment variable names
|
||||
const (
|
||||
envDebug = "DEBUG"
|
||||
)
|
||||
|
||||
// Server encapsulates the HTTP server state and dependencies
|
||||
type Server struct {
|
||||
router *mux.Router
|
||||
server *http.Server
|
||||
restAPIHandle *api.RestAPI
|
||||
natsAPIHandle *api.NatsAPI
|
||||
}
|
||||
|
||||
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
json.NewEncoder(rw).Encode(map[string]string{
|
||||
"status": http.StatusText(http.StatusUnauthorized),
|
||||
"error": err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
// NewServer creates and initializes a new Server instance
|
||||
func NewServer(version, commit, buildDate string) (*Server, error) {
|
||||
buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate}
|
||||
|
||||
s := &Server{
|
||||
router: mux.NewRouter(),
|
||||
}
|
||||
|
||||
if err := s.init(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (s *Server) init() error {
|
||||
// Setup the http.Handler/Router used by the server
|
||||
graph.Init()
|
||||
resolver := graph.GetResolverInstance()
|
||||
graphQLServer := handler.New(
|
||||
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
|
||||
graphQLServer.AddTransport(transport.POST{})
|
||||
|
||||
if os.Getenv(envDebug) != "1" {
|
||||
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
|
||||
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
|
||||
switch e := err.(type) {
|
||||
case string:
|
||||
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||
case error:
|
||||
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
|
||||
}
|
||||
|
||||
return errors.New("MAIN > Internal server error (panic)")
|
||||
})
|
||||
}
|
||||
|
||||
authHandle := auth.GetAuthInstance()
|
||||
|
||||
s.restAPIHandle = api.New()
|
||||
|
||||
info := map[string]any{}
|
||||
info["hasOpenIDConnect"] = false
|
||||
|
||||
if auth.Keys.OpenIDConfig != nil {
|
||||
openIDConnect := auth.NewOIDC(authHandle)
|
||||
openIDConnect.RegisterEndpoints(s.router)
|
||||
info["hasOpenIDConnect"] = true
|
||||
}
|
||||
|
||||
s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
cclog.Debugf("##%v##", info)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||
}).Methods(http.MethodGet)
|
||||
s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||
})
|
||||
s.router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||
})
|
||||
|
||||
secured := s.router.PathPrefix("/").Subrouter()
|
||||
securedapi := s.router.PathPrefix("/api").Subrouter()
|
||||
userapi := s.router.PathPrefix("/userapi").Subrouter()
|
||||
configapi := s.router.PathPrefix("/config").Subrouter()
|
||||
frontendapi := s.router.PathPrefix("/frontend").Subrouter()
|
||||
metricstoreapi := s.router.PathPrefix("/metricstore").Subrouter()
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
// Create login failure handler (used by both /login and /jwt-login)
|
||||
loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}
|
||||
|
||||
s.router.Handle("/login", authHandle.Login(loginFailureHandler)).Methods(http.MethodPost)
|
||||
s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler))
|
||||
|
||||
s.router.Handle("/logout", authHandle.Logout(
|
||||
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Bye - ClusterCockpit",
|
||||
MsgType: "alert-info",
|
||||
Message: "Logout successful",
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))).Methods(http.MethodPost)
|
||||
|
||||
secured.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.Auth(
|
||||
// On success;
|
||||
next,
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Authentication failed - ClusterCockpit",
|
||||
MsgType: "alert-danger",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
Redirect: r.RequestURI,
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
securedapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
userapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthUserAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
metricstoreapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthMetricStoreAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
configapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthConfigAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
frontendapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthFrontendAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
}
|
||||
|
||||
if flagDev {
|
||||
s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||
s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
|
||||
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
|
||||
}
|
||||
secured.Handle("/query", graphQLServer)
|
||||
|
||||
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
|
||||
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||
})
|
||||
|
||||
// Mount all /monitoring/... and /api/... routes.
|
||||
routerConfig.SetupRoutes(secured, buildInfo)
|
||||
s.restAPIHandle.MountAPIRoutes(securedapi)
|
||||
s.restAPIHandle.MountUserAPIRoutes(userapi)
|
||||
s.restAPIHandle.MountConfigAPIRoutes(configapi)
|
||||
s.restAPIHandle.MountFrontendAPIRoutes(frontendapi)
|
||||
|
||||
if config.Keys.APISubjects != nil {
|
||||
s.natsAPIHandle = api.NewNatsAPI()
|
||||
if err := s.natsAPIHandle.StartSubscriptions(); err != nil {
|
||||
return fmt.Errorf("starting NATS subscriptions: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi)
|
||||
|
||||
if config.Keys.EmbedStaticFiles {
|
||||
if i, err := os.Stat("./var/img"); err == nil {
|
||||
if i.IsDir() {
|
||||
cclog.Info("Use local directory for static images")
|
||||
s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||
}
|
||||
}
|
||||
s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles()))
|
||||
} else {
|
||||
s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
|
||||
}
|
||||
|
||||
s.router.Use(handlers.CompressHandler)
|
||||
s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
|
||||
s.router.Use(handlers.CORS(
|
||||
handlers.AllowCredentials(),
|
||||
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
|
||||
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
|
||||
handlers.AllowedOrigins([]string{"*"})))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Server timeout defaults (in seconds)
|
||||
const (
|
||||
defaultReadTimeout = 20
|
||||
defaultWriteTimeout = 20
|
||||
)
|
||||
|
||||
func (s *Server) Start(ctx context.Context) error {
|
||||
handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) {
|
||||
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
|
||||
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
} else {
|
||||
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
}
|
||||
})
|
||||
|
||||
// Use configurable timeouts with defaults
|
||||
readTimeout := time.Duration(defaultReadTimeout) * time.Second
|
||||
writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
|
||||
|
||||
s.server = &http.Server{
|
||||
ReadTimeout: readTimeout,
|
||||
WriteTimeout: writeTimeout,
|
||||
Handler: handler,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting listener on '%s': %w", config.Keys.Addr, err)
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
|
||||
}()
|
||||
}
|
||||
|
||||
if config.Keys.HTTPSCertFile != "" && config.Keys.HTTPSKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(
|
||||
config.Keys.HTTPSCertFile, config.Keys.HTTPSKeyFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("loading X509 keypair (check 'https-cert-file' and 'https-key-file' in config.json): %w", err)
|
||||
}
|
||||
listener = tls.NewListener(listener, &tls.Config{
|
||||
Certificates: []tls.Certificate{cert},
|
||||
CipherSuites: []uint16{
|
||||
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
},
|
||||
MinVersion: tls.VersionTLS12,
|
||||
PreferServerCipherSuites: true,
|
||||
})
|
||||
cclog.Infof("HTTPS server listening at %s...", config.Keys.Addr)
|
||||
} else {
|
||||
cclog.Infof("HTTP server listening at %s...", config.Keys.Addr)
|
||||
}
|
||||
//
|
||||
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||
// be established first, then the user can be changed, and after that,
|
||||
// the actual http server can be started.
|
||||
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||
return fmt.Errorf("dropping privileges: %w", err)
|
||||
}
|
||||
|
||||
// Handle context cancellation for graceful shutdown
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||
cclog.Errorf("Server shutdown error: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
if err = s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
return fmt.Errorf("server failed: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) Shutdown(ctx context.Context) {
|
||||
// Create a shutdown context with timeout
|
||||
shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
nc := nats.GetClient()
|
||||
if nc != nil {
|
||||
nc.Close()
|
||||
}
|
||||
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||
cclog.Errorf("Server shutdown error: %v", err)
|
||||
}
|
||||
|
||||
// Archive all the metric store data
|
||||
memorystore.Shutdown()
|
||||
|
||||
// Shutdown archiver with 10 second timeout for fast shutdown
|
||||
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,56 +1,96 @@
|
||||
{
|
||||
"main": {
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"minimumPoints": 600,
|
||||
"trigger": 180,
|
||||
"resolutions": [
|
||||
240,
|
||||
60
|
||||
]
|
||||
},
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
],
|
||||
"emission-constant": 317
|
||||
},
|
||||
"cron": {
|
||||
"commit-job-worker": "2m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"max-age": "2000h"
|
||||
}
|
||||
},
|
||||
"nats": {
|
||||
"address": "nats://0.0.0.0:4222",
|
||||
"username": "root",
|
||||
"password": "root"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metric-store": {
|
||||
"checkpoints": {
|
||||
"file-format": "avro",
|
||||
"interval": "1h",
|
||||
"directory": "./var/checkpoints",
|
||||
"restore": "48h"
|
||||
},
|
||||
"archive": {
|
||||
"interval": "1h",
|
||||
"directory": "./var/archive"
|
||||
},
|
||||
"retention-in-memory": "48h",
|
||||
"subscriptions": [
|
||||
{
|
||||
"subscribe-to": "hpc-nats",
|
||||
"cluster-tag": "fritz"
|
||||
},
|
||||
{
|
||||
"subscribe-to": "hpc-nats",
|
||||
"cluster-tag": "alex"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,50 +1,49 @@
|
||||
{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:443",
|
||||
"ldap": {
|
||||
"url": "ldaps://test",
|
||||
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
|
||||
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
|
||||
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
|
||||
"user_filter": "(&(objectclass=posixAccount))"
|
||||
},
|
||||
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
|
||||
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
|
||||
"user": "clustercockpit",
|
||||
"group": "clustercockpit",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"validate": true,
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "eyJhbGciOiJF-E-pQBQ"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"validate": false,
|
||||
"apiAllowedIPs": ["*"],
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"minimumPoints": 600,
|
||||
"trigger": 180,
|
||||
"resolutions": [
|
||||
240,
|
||||
60
|
||||
]
|
||||
}
|
||||
},
|
||||
"cron": {
|
||||
"commit-job-worker": "2m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
],
|
||||
"jwts": {
|
||||
"cookieName": "",
|
||||
"validateUser": false,
|
||||
"max-age": "2000h",
|
||||
"trustedIssuer": ""
|
||||
},
|
||||
"short-running-jobs-duration": 300
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
|
||||
|
||||
my $node;
|
||||
my @sockets;
|
||||
my @nodeCores;
|
||||
foreach my $socket ( @{$DOMAINS{socket}} ) {
|
||||
push @sockets, "[".join(",", @{$socket})."]";
|
||||
$node .= join(",", @{$socket})
|
||||
push @nodeCores, join(",", @{$socket});
|
||||
}
|
||||
$node = join(",", @nodeCores);
|
||||
$INFO{sockets} = join(",\n", @sockets);
|
||||
|
||||
my @memDomains;
|
||||
@@ -212,9 +214,27 @@ print <<"END";
|
||||
"socketsPerNode": $INFO{socketsPerNode},
|
||||
"coresPerSocket": $INFO{coresPerSocket},
|
||||
"threadsPerCore": $INFO{threadsPerCore},
|
||||
"flopRateScalar": $flopsScalar,
|
||||
"flopRateSimd": $flopsSimd,
|
||||
"memoryBandwidth": $memBw,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsScalar
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsSimd
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $memBw
|
||||
},
|
||||
"nodes": "<FILL IN NODE RANGES>",
|
||||
"topology": {
|
||||
"node": [$node],
|
||||
|
||||
22
configs/startJobPayload.json
Normal file
22
configs/startJobPayload.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"cluster": "fritz",
|
||||
"jobId": 123000,
|
||||
"jobState": "running",
|
||||
"numAcc": 0,
|
||||
"numHwthreads": 72,
|
||||
"numNodes": 1,
|
||||
"partition": "main",
|
||||
"requestedMemory": 128000,
|
||||
"resources": [{ "hostname": "f0726" }],
|
||||
"startTime": 1649723812,
|
||||
"subCluster": "main",
|
||||
"submitTime": 1649723812,
|
||||
"user": "k106eb10",
|
||||
"project": "k106eb",
|
||||
"walltime": 86400,
|
||||
"metaData": {
|
||||
"slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n",
|
||||
"jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n",
|
||||
"jobName": "ams_pipeline"
|
||||
}
|
||||
}
|
||||
7
configs/stopJobPayload.json
Normal file
7
configs/stopJobPayload.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"cluster": "fritz",
|
||||
"jobId": 123000,
|
||||
"jobState": "completed",
|
||||
"startTime": 1649723812,
|
||||
"stopTime": 1649763839
|
||||
}
|
||||
45
configs/uiConfig.json
Normal file
45
configs/uiConfig.json
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"jobList": {
|
||||
"usePaging": false,
|
||||
"showFootprint":false
|
||||
},
|
||||
"jobView": {
|
||||
"showPolarPlot": true,
|
||||
"showFootprint": true,
|
||||
"showRoofline": true,
|
||||
"showStatTable": true
|
||||
},
|
||||
"metricConfig": {
|
||||
"jobListMetrics": ["mem_bw", "flops_dp"],
|
||||
"jobViewPlotMetrics": ["mem_bw", "flops_dp"],
|
||||
"jobViewTableMetrics": ["mem_bw", "flops_dp"],
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "one",
|
||||
"jobListMetrics": ["mem_used", "flops_sp"]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"nodeList": {
|
||||
"usePaging": true
|
||||
},
|
||||
"plotConfiguration": {
|
||||
"plotsPerRow": 3,
|
||||
"colorBackground": true,
|
||||
"lineWidth": 3,
|
||||
"colorScheme": [
|
||||
"#00bfff",
|
||||
"#0000ff",
|
||||
"#ff00ff",
|
||||
"#ff0000",
|
||||
"#ff8000",
|
||||
"#ffff00",
|
||||
"#80ff00"
|
||||
]
|
||||
}
|
||||
}
|
||||
170
go.mod
170
go.mod
@@ -1,92 +1,126 @@
|
||||
module github.com/ClusterCockpit/cc-backend
|
||||
|
||||
go 1.18
|
||||
go 1.24.0
|
||||
|
||||
toolchain go1.24.1
|
||||
|
||||
tool (
|
||||
github.com/99designs/gqlgen
|
||||
github.com/swaggo/swag/cmd/swag
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/99designs/gqlgen v0.17.45
|
||||
github.com/ClusterCockpit/cc-units v0.4.0
|
||||
github.com/Masterminds/squirrel v1.5.3
|
||||
github.com/coreos/go-oidc/v3 v3.9.0
|
||||
github.com/go-co-op/gocron v1.25.0
|
||||
github.com/go-ldap/ldap/v3 v3.4.4
|
||||
github.com/go-sql-driver/mysql v1.7.0
|
||||
github.com/golang-jwt/jwt/v5 v5.2.1
|
||||
github.com/golang-migrate/migrate/v4 v4.15.2
|
||||
github.com/google/gops v0.3.27
|
||||
github.com/gorilla/handlers v1.5.1
|
||||
github.com/gorilla/mux v1.8.0
|
||||
github.com/gorilla/sessions v1.2.1
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.12.2
|
||||
github.com/jmoiron/sqlx v1.3.5
|
||||
github.com/mattn/go-sqlite3 v1.14.16
|
||||
github.com/prometheus/client_golang v1.14.0
|
||||
github.com/prometheus/common v0.40.0
|
||||
github.com/99designs/gqlgen v0.17.84
|
||||
github.com/ClusterCockpit/cc-lib v1.0.2
|
||||
github.com/Masterminds/squirrel v1.5.4
|
||||
github.com/aws/aws-sdk-go-v2 v1.41.0
|
||||
github.com/aws/aws-sdk-go-v2/config v1.31.20
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.18.24
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2
|
||||
github.com/coreos/go-oidc/v3 v3.16.0
|
||||
github.com/expr-lang/expr v1.17.6
|
||||
github.com/go-co-op/gocron/v2 v2.18.2
|
||||
github.com/go-ldap/ldap/v3 v3.4.12
|
||||
github.com/golang-jwt/jwt/v5 v5.3.0
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1
|
||||
github.com/google/gops v0.3.28
|
||||
github.com/gorilla/handlers v1.5.2
|
||||
github.com/gorilla/mux v1.8.1
|
||||
github.com/gorilla/sessions v1.4.0
|
||||
github.com/influxdata/line-protocol/v2 v2.2.1
|
||||
github.com/jmoiron/sqlx v1.4.0
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/linkedin/goavro/v2 v2.14.1
|
||||
github.com/mattn/go-sqlite3 v1.14.32
|
||||
github.com/nats-io/nats.go v1.47.0
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/prometheus/common v0.67.4
|
||||
github.com/qustavo/sqlhooks/v2 v2.1.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
|
||||
github.com/swaggo/http-swagger v1.3.3
|
||||
github.com/swaggo/swag v1.16.3
|
||||
github.com/vektah/gqlparser/v2 v2.5.11
|
||||
golang.org/x/crypto v0.21.0
|
||||
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
|
||||
golang.org/x/oauth2 v0.13.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/swaggo/http-swagger v1.3.4
|
||||
github.com/swaggo/swag v1.16.6
|
||||
github.com/vektah/gqlparser/v2 v2.5.31
|
||||
golang.org/x/crypto v0.45.0
|
||||
golang.org/x/oauth2 v0.32.0
|
||||
golang.org/x/time v0.14.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||
github.com/agnivade/levenshtein v1.2.1 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect
|
||||
github.com/aws/smithy-go v1.24.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
||||
github.com/containerd/containerd v1.6.26 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
|
||||
github.com/deepmap/oapi-codegen v1.12.4 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.3 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
|
||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.21.0 // indirect
|
||||
github.com/go-openapi/spec v0.21.0 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
github.com/golang/protobuf v1.5.3 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fsnotify/fsnotify v1.9.0 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect
|
||||
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.22.3 // indirect
|
||||
github.com/go-openapi/jsonreference v0.21.3 // indirect
|
||||
github.com/go-openapi/spec v0.22.1 // indirect
|
||||
github.com/go-openapi/swag/conv v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/jsonname v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/loading v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
|
||||
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
|
||||
github.com/goccy/go-yaml v1.19.0 // indirect
|
||||
github.com/golang/snappy v0.0.4 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.1 // indirect
|
||||
github.com/gorilla/websocket v1.5.0 // indirect
|
||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
||||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
||||
github.com/gorilla/securecookie v1.1.2 // indirect
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/jonboulle/clockwork v0.5.0 // indirect
|
||||
github.com/jpillora/backoff v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.18.1 // indirect
|
||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
||||
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
|
||||
github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/prometheus/client_model v0.3.0 // indirect
|
||||
github.com/prometheus/procfs v0.9.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.11 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/sosodev/duration v1.2.0 // indirect
|
||||
github.com/swaggo/files v1.0.0 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.1 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
|
||||
go.uber.org/atomic v1.10.0 // indirect
|
||||
golang.org/x/mod v0.16.0 // indirect
|
||||
golang.org/x/net v0.22.0 // indirect
|
||||
golang.org/x/sys v0.18.0 // indirect
|
||||
golang.org/x/text v0.14.0 // indirect
|
||||
golang.org/x/tools v0.19.0 // indirect
|
||||
google.golang.org/appengine v1.6.8 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
|
||||
google.golang.org/protobuf v1.33.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
github.com/sosodev/duration v1.3.1 // indirect
|
||||
github.com/stretchr/objx v0.5.2 // indirect
|
||||
github.com/swaggo/files v1.0.1 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.7 // indirect
|
||||
github.com/urfave/cli/v3 v3.6.1 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
golang.org/x/mod v0.30.0 // indirect
|
||||
golang.org/x/net v0.47.0 // indirect
|
||||
golang.org/x/sync v0.18.0 // indirect
|
||||
golang.org/x/sys v0.38.0 // indirect
|
||||
golang.org/x/text v0.31.0 // indirect
|
||||
golang.org/x/tools v0.39.0 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
||||
sigs.k8s.io/yaml v1.6.0 // indirect
|
||||
)
|
||||
|
||||
63
gqlgen.yml
63
gqlgen.yml
@@ -30,7 +30,9 @@ resolver:
|
||||
# gqlgen will search for any type names in the schema in these go packages
|
||||
# if they match it will use them, otherwise it will generate them.
|
||||
autobind:
|
||||
- "github.com/99designs/gqlgen/graphql/introspection"
|
||||
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
- "github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
|
||||
# This section declares type mapping between the GraphQL and go type systems
|
||||
#
|
||||
@@ -50,34 +52,51 @@ models:
|
||||
- github.com/99designs/gqlgen/graphql.Int64
|
||||
- github.com/99designs/gqlgen/graphql.Int32
|
||||
Job:
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Job"
|
||||
model: "github.com/ClusterCockpit/cc-lib/schema.Job"
|
||||
fields:
|
||||
tags:
|
||||
resolver: true
|
||||
metaData:
|
||||
resolver: true
|
||||
Cluster:
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Cluster"
|
||||
model: "github.com/ClusterCockpit/cc-lib/schema.Cluster"
|
||||
fields:
|
||||
partitions:
|
||||
resolver: true
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
|
||||
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
|
||||
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
|
||||
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
|
||||
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
|
||||
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
|
||||
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
|
||||
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
|
||||
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
|
||||
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
|
||||
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
|
||||
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
|
||||
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
|
||||
# Node:
|
||||
# model: "github.com/ClusterCockpit/cc-lib/schema.Node"
|
||||
# fields:
|
||||
# metaData:
|
||||
# resolver: true
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" }
|
||||
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" }
|
||||
JobStatistics:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" }
|
||||
GlobalMetricListItem:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" }
|
||||
ClusterSupport:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" }
|
||||
Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" }
|
||||
SchedulerState:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" }
|
||||
HealthState:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" }
|
||||
MetricStatistics:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" }
|
||||
MetricConfig:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" }
|
||||
SubClusterConfig:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" }
|
||||
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" }
|
||||
Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" }
|
||||
FilterRanges:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" }
|
||||
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" }
|
||||
Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" }
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit Web Server (Go edition)
|
||||
Description=ClusterCockpit Web Server
|
||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
After=mariadb.service mysql.service
|
||||
# Database is file-based SQLite - no service dependency required
|
||||
|
||||
[Service]
|
||||
WorkingDirectory=/opt/monitoring/cc-backend
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api_test
|
||||
@@ -14,38 +14,49 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
"sync"
|
||||
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func setup(t *testing.T) *api.RestApi {
|
||||
func setup(t *testing.T) *api.RestAPI {
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 64 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
@@ -54,7 +65,7 @@ func setup(t *testing.T) *api.RestApi {
|
||||
}
|
||||
]
|
||||
}`
|
||||
const testclusterJson = `{
|
||||
const testclusterJSON = `{
|
||||
"name": "testcluster",
|
||||
"subClusters": [
|
||||
{
|
||||
@@ -110,97 +121,108 @@ func setup(t *testing.T) *api.RestApi {
|
||||
]
|
||||
}`
|
||||
|
||||
log.Init("info", true)
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
||||
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
||||
err := repository.MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
config.Init(cfgFilePath)
|
||||
ccconf.Init(cfgFilePath)
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
repository.Connect("sqlite3", dbfilepath)
|
||||
db := repository.GetConnection()
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
||||
t.Fatal(err)
|
||||
// Initialize memorystore (optional - will return nil if not configured)
|
||||
// For this test, we don't initialize it to test the nil handling
|
||||
mscfg := ccconf.GetPackageConfig("metric-store")
|
||||
if mscfg != nil {
|
||||
var wg sync.WaitGroup
|
||||
memorystore.Init(mscfg, &wg)
|
||||
}
|
||||
|
||||
jobRepo := repository.GetJobRepository()
|
||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
||||
archiver.Start(repository.GetJobRepository(), context.Background())
|
||||
|
||||
return &api.RestApi{
|
||||
JobRepository: resolver.Repo,
|
||||
Resolver: resolver,
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
graph.Init()
|
||||
|
||||
return api.New()
|
||||
}
|
||||
|
||||
func cleanup() {
|
||||
// TODO: Clear all caches, reset all modules, etc...
|
||||
// Gracefully shutdown archiver with timeout
|
||||
if err := archiver.Shutdown(5 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
|
||||
}
|
||||
|
||||
// Shutdown memorystore if it was initialized
|
||||
memorystore.Shutdown()
|
||||
}
|
||||
|
||||
/*
|
||||
* This function starts a job, stops it, and then reads its data from the job-archive.
|
||||
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
|
||||
* at least `setup` modifies global state.
|
||||
* This function starts a job, stops it, and tests the REST API.
|
||||
* Do not run sub-tests in parallel! Tests should not be run in parallel at all, because
|
||||
* at least `setup` modifies global state.
|
||||
*/
|
||||
func TestRestApi(t *testing.T) {
|
||||
restapi := setup(t)
|
||||
t.Cleanup(cleanup)
|
||||
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
Unit: schema.Unit{Base: "load"},
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "host123",
|
||||
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
restapi.MountRoutes(r)
|
||||
r.PathPrefix("/api").Subrouter()
|
||||
r.StrictSlash(true)
|
||||
restapi.MountAPIRoutes(r)
|
||||
|
||||
var TestJobId int64 = 123
|
||||
TestClusterName := "testcluster"
|
||||
var TestStartTime int64 = 123456789
|
||||
|
||||
const startJobBody string = `{
|
||||
"jobId": 123,
|
||||
"jobId": 123,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
@@ -210,10 +232,9 @@ func TestRestApi(t *testing.T) {
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"exclusive": 1,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"tags": [{ "type": "testTagType", "name": "testTagName" }],
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
@@ -224,28 +245,28 @@ func TestRestApi(t *testing.T) {
|
||||
"startTime": 123456789
|
||||
}`
|
||||
|
||||
var dbid int64
|
||||
const contextUserKey repository.ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
if ok := t.Run("StartJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
var res api.StartJobApiResponse
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
|
||||
restapi.JobRepository.SyncJobs()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -257,23 +278,16 @@ func TestRestApi(t *testing.T) {
|
||||
job.SubCluster != "sc1" ||
|
||||
job.Partition != "default" ||
|
||||
job.Walltime != 3600 ||
|
||||
job.ArrayJobId != 0 ||
|
||||
job.ArrayJobID != 0 ||
|
||||
job.NumNodes != 1 ||
|
||||
job.NumHWThreads != 8 ||
|
||||
job.NumAcc != 0 ||
|
||||
job.Exclusive != 1 ||
|
||||
job.MonitoringStatus != 1 ||
|
||||
job.SMT != 1 ||
|
||||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
|
||||
job.StartTime.Unix() != 123456789 {
|
||||
job.StartTime != 123456789 {
|
||||
t.Fatalf("unexpected job properties: %#v", job)
|
||||
}
|
||||
|
||||
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
|
||||
t.Fatalf("unexpected tags: %#v", job.Tags)
|
||||
}
|
||||
|
||||
dbid = res.DBID
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
@@ -287,19 +301,20 @@ func TestRestApi(t *testing.T) {
|
||||
"stopTime": 123457789
|
||||
}`
|
||||
|
||||
var stoppedJob *schema.Job
|
||||
if ok := t.Run("StopJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
|
||||
// Archiving happens asynchronously, will be completed in cleanup
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -321,30 +336,23 @@ func TestRestApi(t *testing.T) {
|
||||
t.Fatalf("unexpected job.metaData: %#v", job.MetaData)
|
||||
}
|
||||
|
||||
stoppedJob = job
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("CheckArchive", func(t *testing.T) {
|
||||
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(data, testData) {
|
||||
t.Fatal("unexpected data fetched from archive")
|
||||
}
|
||||
})
|
||||
// Note: We skip the CheckArchive test because without memorystore initialized,
|
||||
// archiving will fail gracefully. This test now focuses on the REST API itself.
|
||||
|
||||
t.Run("CheckDoubleStart", func(t *testing.T) {
|
||||
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
|
||||
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
|
||||
body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusUnprocessableEntity {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -359,7 +367,7 @@ func TestRestApi(t *testing.T) {
|
||||
"partition": "default",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"exclusive": 1,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
@@ -371,10 +379,12 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok := t.Run("StartJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -384,8 +394,11 @@ func TestRestApi(t *testing.T) {
|
||||
t.Fatal("subtest failed")
|
||||
}
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
restapi.JobRepository.SyncJobs()
|
||||
|
||||
const stopJobBodyFailed string = `{
|
||||
"jobId": 12345,
|
||||
"jobId": 12345,
|
||||
"cluster": "testcluster",
|
||||
|
||||
"jobState": "failed",
|
||||
@@ -393,16 +406,18 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok = t.Run("StopJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
// Archiving happens asynchronously, will be completed in cleanup
|
||||
jobid, cluster := int64(12345), "testcluster"
|
||||
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||
if err != nil {
|
||||
|
||||
71
internal/api/cluster.go
Normal file
71
internal/api/cluster.go
Normal file
@@ -0,0 +1,71 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// GetClustersAPIResponse model
|
||||
type GetClustersAPIResponse struct {
|
||||
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
|
||||
}
|
||||
|
||||
// getClusters godoc
|
||||
// @summary Lists all cluster configs
|
||||
// @tags Cluster query
|
||||
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
|
||||
// @produce json
|
||||
// @param cluster query string false "Job Cluster"
|
||||
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/clusters/ [get]
|
||||
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
|
||||
var clusters []*schema.Cluster
|
||||
|
||||
if r.URL.Query().Has("cluster") {
|
||||
name := r.URL.Query().Get("cluster")
|
||||
cluster := archive.GetCluster(name)
|
||||
if cluster == nil {
|
||||
handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
clusters = append(clusters, cluster)
|
||||
} else {
|
||||
clusters = archive.Clusters
|
||||
}
|
||||
|
||||
payload := GetClustersAPIResponse{
|
||||
Clusters: clusters,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
1024
internal/api/job.go
Normal file
1024
internal/api/job.go
Normal file
File diff suppressed because it is too large
Load Diff
170
internal/api/memorystore.go
Normal file
170
internal/api/memorystore.go
Normal file
@@ -0,0 +1,170 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
// handleFree godoc
|
||||
// @summary
|
||||
// @tags free
|
||||
// @description This endpoint allows the users to free the Buffers from the
|
||||
// metric store. This endpoint offers the users to remove then systematically
|
||||
// and also allows then to prune the data under node, if they do not want to
|
||||
// remove the whole node.
|
||||
// @produce json
|
||||
// @param to query string false "up to timestamp"
|
||||
// @success 200 {string} string "ok"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /free/ [post]
|
||||
func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
rawTo := r.URL.Query().Get("to")
|
||||
if rawTo == "" {
|
||||
handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
to, err := strconv.ParseInt(rawTo, 10, 64)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
bodyDec := json.NewDecoder(r.Body)
|
||||
var selectors [][]string
|
||||
err = bodyDec.Decode(&selectors)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
n := 0
|
||||
for _, sel := range selectors {
|
||||
bn, err := ms.Free(sel, to)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
n += bn
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
fmt.Fprintf(rw, "buffers freed: %d\n", n)
|
||||
}
|
||||
|
||||
// handleWrite godoc
|
||||
// @summary Receive metrics in InfluxDB line-protocol
|
||||
// @tags write
|
||||
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
|
||||
|
||||
// @accept plain
|
||||
// @produce json
|
||||
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
|
||||
// @success 200 {string} string "ok"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /write/ [post]
|
||||
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
bytes, err := io.ReadAll(r.Body)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
dec := lineprotocol.NewDecoderWithBytes(bytes)
|
||||
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
|
||||
cclog.Errorf("/api/write error: %s", err.Error())
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
// handleDebug godoc
|
||||
// @summary Debug endpoint
|
||||
// @tags debug
|
||||
// @description This endpoint allows the users to print the content of
|
||||
// nodes/clusters/metrics to review the state of the data.
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /debug/ [post]
|
||||
func debugMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
raw := r.URL.Query().Get("selector")
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
selector := []string{}
|
||||
if len(raw) != 0 {
|
||||
selector = strings.Split(raw, ":")
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealthCheck godoc
|
||||
// @summary HealthCheck endpoint
|
||||
// @tags healthcheck
|
||||
// @description This endpoint allows the users to check if a node is healthy
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /healthcheck/ [get]
|
||||
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
rawCluster := r.URL.Query().Get("cluster")
|
||||
rawNode := r.URL.Query().Get("node")
|
||||
|
||||
if rawCluster == "" || rawNode == "" {
|
||||
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
|
||||
selector := []string{rawCluster, rawNode}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
231
internal/api/nats.go
Normal file
231
internal/api/nats.go
Normal file
@@ -0,0 +1,231 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/nats"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
|
||||
// It mirrors the functionality of the REST API but uses NATS messaging.
|
||||
type NatsAPI struct {
|
||||
JobRepository *repository.JobRepository
|
||||
// RepositoryMutex protects job creation operations from race conditions
|
||||
// when checking for duplicate jobs during startJob calls.
|
||||
RepositoryMutex sync.Mutex
|
||||
}
|
||||
|
||||
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
|
||||
func NewNatsAPI() *NatsAPI {
|
||||
return &NatsAPI{
|
||||
JobRepository: repository.GetJobRepository(),
|
||||
}
|
||||
}
|
||||
|
||||
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
|
||||
// Returns an error if the NATS client is not available or subscription fails.
|
||||
func (api *NatsAPI) StartSubscriptions() error {
|
||||
client := nats.GetClient()
|
||||
if client == nil {
|
||||
cclog.Warn("NATS client not available, skipping API subscriptions")
|
||||
return nil
|
||||
}
|
||||
|
||||
if config.Keys.APISubjects != nil {
|
||||
|
||||
s := config.Keys.APISubjects
|
||||
|
||||
if err := client.Subscribe(s.SubjectJobStart, api.handleStartJob); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := client.Subscribe(s.SubjectJobStop, api.handleStopJob); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cclog.Info("NATS API subscriptions started")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// handleStartJob processes job start messages received via NATS.
|
||||
// Expected JSON payload follows the schema.Job structure.
|
||||
func (api *NatsAPI) handleStartJob(subject string, data []byte) {
|
||||
req := schema.Job{
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
}
|
||||
|
||||
dec := json.NewDecoder(bytes.NewReader(data))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
cclog.Debugf("NATS %s: %s", subject, req.GoString())
|
||||
req.State = schema.JobStateRunning
|
||||
|
||||
if err := importer.SanityChecks(&req); err != nil {
|
||||
cclog.Errorf("NATS %s: sanity check failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
var unlockOnce sync.Once
|
||||
api.RepositoryMutex.Lock()
|
||||
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
cclog.Errorf("NATS %s: checking for duplicate failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
if err == nil {
|
||||
for _, job := range jobs {
|
||||
if (req.StartTime - job.StartTime) < secondsPerDay {
|
||||
cclog.Errorf("NATS %s: job with jobId %d, cluster %s already exists (dbid: %d)",
|
||||
subject, req.JobID, req.Cluster, job.ID)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
id, err := api.JobRepository.Start(&req)
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS %s: insert into database failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
for _, tag := range req.Tags {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
cclog.Errorf("NATS %s: adding tag to new job %d failed: %v", subject, id, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
|
||||
id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
}
|
||||
|
||||
// handleStopJob processes job stop messages received via NATS.
|
||||
// Expected JSON payload follows the StopJobAPIRequest structure.
|
||||
func (api *NatsAPI) handleStopJob(subject string, data []byte) {
|
||||
var req StopJobAPIRequest
|
||||
|
||||
dec := json.NewDecoder(bytes.NewReader(data))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
if req.JobID == nil {
|
||||
cclog.Errorf("NATS %s: the field 'jobId' is required", subject)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
|
||||
if cachedErr != nil {
|
||||
cclog.Errorf("NATS %s: finding job failed: %v (cached lookup also failed: %v)",
|
||||
subject, err, cachedErr)
|
||||
return
|
||||
}
|
||||
job = cachedJob
|
||||
}
|
||||
|
||||
if job.State != schema.JobStateRunning {
|
||||
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
|
||||
subject, job.JobID, job.ID, job.Cluster, job.State)
|
||||
return
|
||||
}
|
||||
|
||||
if job.StartTime > req.StopTime {
|
||||
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
|
||||
subject, job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: invalid job state: %#v",
|
||||
subject, job.JobID, job.ID, job.Cluster, req.State)
|
||||
return
|
||||
} else if req.State == "" {
|
||||
req.State = schema.JobStateCompleted
|
||||
}
|
||||
|
||||
job.Duration = int32(req.StopTime - job.StartTime)
|
||||
job.State = req.State
|
||||
api.JobRepository.Mutex.Lock()
|
||||
defer api.JobRepository.Mutex.Unlock()
|
||||
|
||||
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
cclog.Errorf("NATS %s: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
|
||||
subject, job.JobID, job.ID, job.Cluster, job.State, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
|
||||
job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
||||
return
|
||||
}
|
||||
|
||||
archiver.TriggerArchiving(job)
|
||||
}
|
||||
|
||||
// handleNodeState processes node state update messages received via NATS.
|
||||
// Expected JSON payload follows the UpdateNodeStatesRequest structure.
|
||||
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
|
||||
var req UpdateNodeStatesRequest
|
||||
|
||||
dec := json.NewDecoder(bytes.NewReader(data))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS %s: parsing request failed: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(),
|
||||
NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
||||
}
|
||||
|
||||
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)
|
||||
}
|
||||
80
internal/api/node.go
Normal file
80
internal/api/node.go
Normal file
@@ -0,0 +1,80 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type UpdateNodeStatesRequest struct {
|
||||
Nodes []schema.NodePayload `json:"nodes"`
|
||||
Cluster string `json:"cluster" example:"fritz"`
|
||||
}
|
||||
|
||||
// this routine assumes that only one of them exists per node
|
||||
func determineState(states []string) schema.SchedulerState {
|
||||
for _, state := range states {
|
||||
switch strings.ToLower(state) {
|
||||
case "allocated":
|
||||
return schema.NodeStateAllocated
|
||||
case "reserved":
|
||||
return schema.NodeStateReserved
|
||||
case "idle":
|
||||
return schema.NodeStateIdle
|
||||
case "down":
|
||||
return schema.NodeStateDown
|
||||
case "mixed":
|
||||
return schema.NodeStateMixed
|
||||
}
|
||||
}
|
||||
|
||||
return schema.NodeStateUnknown
|
||||
}
|
||||
|
||||
// updateNodeStates godoc
|
||||
// @summary Deliver updated Slurm node states
|
||||
// @tags Nodestates
|
||||
// @description Returns a JSON-encoded list of users.
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/nodestats/ [post]
|
||||
func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
// Parse request body
|
||||
req := UpdateNodeStatesRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err),
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(), NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
||||
}
|
||||
}
|
||||
1467
internal/api/rest.go
1467
internal/api/rest.go
File diff suppressed because it is too large
Load Diff
221
internal/api/user.go
Normal file
221
internal/api/user.go
Normal file
@@ -0,0 +1,221 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
type APIReturnedUser struct {
|
||||
Username string `json:"username"`
|
||||
Name string `json:"name"`
|
||||
Roles []string `json:"roles"`
|
||||
Email string `json:"email"`
|
||||
Projects []string `json:"projects"`
|
||||
}
|
||||
|
||||
// getUsers godoc
|
||||
// @summary Returns a list of users
|
||||
// @tags User
|
||||
// @description Returns a JSON-encoded list of users.
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
|
||||
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
|
||||
// @failure 400 {string} string "Bad Request"
|
||||
// @failure 401 {string} string "Unauthorized"
|
||||
// @failure 403 {string} string "Forbidden"
|
||||
// @failure 500 {string} string "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/users/ [get]
|
||||
func (api *RestAPI) getUsers(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to fetch a list of users"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("listing users failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(rw).Encode(users); err != nil {
|
||||
cclog.Errorf("Failed to encode users response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// updateUser godoc
|
||||
// @summary Update user roles and projects
|
||||
// @tags User
|
||||
// @description Allows admins to add/remove roles and projects for a user
|
||||
// @produce plain
|
||||
// @param id path string true "Username"
|
||||
// @param add-role formData string false "Role to add"
|
||||
// @param remove-role formData string false "Role to remove"
|
||||
// @param add-project formData string false "Project to add"
|
||||
// @param remove-project formData string false "Project to remove"
|
||||
// @success 200 {string} string "Success message"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/user/{id} [post]
|
||||
func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to update a user"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Get Values
|
||||
newrole := r.FormValue("add-role")
|
||||
delrole := r.FormValue("remove-role")
|
||||
newproj := r.FormValue("add-project")
|
||||
delproj := r.FormValue("remove-project")
|
||||
|
||||
rw.Header().Set("Content-Type", "application/json")
|
||||
|
||||
// Handle role updates
|
||||
if newrole != "" {
|
||||
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
|
||||
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Role Success"}); err != nil {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if delrole != "" {
|
||||
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
|
||||
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Role Success"}); err != nil {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if newproj != "" {
|
||||
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
|
||||
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Add Project Success"}); err != nil {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if delproj != "" {
|
||||
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
|
||||
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{Message: "Remove Project Success"}); err != nil {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else {
|
||||
handleError(fmt.Errorf("no operation specified: must provide add-role, remove-role, add-project, or remove-project"), http.StatusBadRequest, rw)
|
||||
}
|
||||
}
|
||||
|
||||
// createUser godoc
|
||||
// @summary Create a new user
|
||||
// @tags User
|
||||
// @description Creates a new user with specified credentials and role
|
||||
// @produce plain
|
||||
// @param username formData string true "Username"
|
||||
// @param password formData string false "Password (not required for API users)"
|
||||
// @param role formData string true "User role"
|
||||
// @param name formData string false "Full name"
|
||||
// @param email formData string false "Email address"
|
||||
// @param project formData string false "Project (required for managers)"
|
||||
// @success 200 {string} string "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/users/ [post]
|
||||
func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
me := repository.GetUserFromContext(r.Context())
|
||||
if !me.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to create new users"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
username, password, role, name, email, project := r.FormValue("username"),
|
||||
r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
|
||||
r.FormValue("email"), r.FormValue("project")
|
||||
|
||||
// Validate username length
|
||||
if len(username) == 0 || len(username) > 100 {
|
||||
handleError(fmt.Errorf("username must be between 1 and 100 characters"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
|
||||
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
|
||||
handleError(fmt.Errorf("only managers require a project (can be changed later)"), http.StatusBadRequest, rw)
|
||||
return
|
||||
} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
|
||||
handleError(fmt.Errorf("managers require a project to manage (can be changed later)"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if err := repository.GetUserRepository().AddUser(&schema.User{
|
||||
Username: username,
|
||||
Name: name,
|
||||
Password: password,
|
||||
Email: email,
|
||||
Projects: []string{project},
|
||||
Roles: []string{role},
|
||||
}); err != nil {
|
||||
handleError(fmt.Errorf("adding user failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Fprintf(rw, "User %v successfully created!\n", username)
|
||||
}
|
||||
|
||||
// deleteUser godoc
|
||||
// @summary Delete a user
|
||||
// @tags User
|
||||
// @description Deletes a user from the system
|
||||
// @produce plain
|
||||
// @param username formData string true "Username to delete"
|
||||
// @success 200 {string} string "Success"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/users/ [delete]
|
||||
func (api *RestAPI) deleteUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to delete a user"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
username := r.FormValue("username")
|
||||
if err := repository.GetUserRepository().DelUser(username); err != nil {
|
||||
handleError(fmt.Errorf("deleting user failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
190
internal/archiver/README.md
Normal file
190
internal/archiver/README.md
Normal file
@@ -0,0 +1,190 @@
|
||||
# Archiver Package
|
||||
|
||||
The `archiver` package provides asynchronous job archiving functionality for ClusterCockpit. When jobs complete, their metric data is archived from the metric store to a persistent archive backend (filesystem, S3, SQLite, etc.).
|
||||
|
||||
## Architecture
|
||||
|
||||
### Producer-Consumer Pattern
|
||||
|
||||
```
|
||||
┌──────────────┐ TriggerArchiving() ┌───────────────┐
|
||||
│ API Handler │ ───────────────────────▶ │ archiveChannel│
|
||||
│ (Job Stop) │ │ (buffer: 128)│
|
||||
└──────────────┘ └───────┬───────┘
|
||||
│
|
||||
┌─────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ archivingWorker() │
|
||||
│ (goroutine) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
1. Fetch job metadata
|
||||
2. Load metric data
|
||||
3. Calculate statistics
|
||||
4. Archive to backend
|
||||
5. Update database
|
||||
6. Call hooks
|
||||
```
|
||||
|
||||
### Components
|
||||
|
||||
- **archiveChannel**: Buffered channel (128 jobs) for async communication
|
||||
- **archivePending**: WaitGroup tracking in-flight archiving operations
|
||||
- **archivingWorker**: Background goroutine processing archiving requests
|
||||
- **shutdownCtx**: Context for graceful cancellation during shutdown
|
||||
|
||||
## Usage
|
||||
|
||||
### Initialization
|
||||
|
||||
```go
|
||||
// Start archiver with context for shutdown control
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
archiver.Start(jobRepository, ctx)
|
||||
```
|
||||
|
||||
### Archiving a Job
|
||||
|
||||
```go
|
||||
// Called automatically when a job completes
|
||||
archiver.TriggerArchiving(job)
|
||||
```
|
||||
|
||||
The function returns immediately. Actual archiving happens in the background.
|
||||
|
||||
### Graceful Shutdown
|
||||
|
||||
```go
|
||||
// Shutdown with 10 second timeout
|
||||
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
log.Printf("Archiver shutdown timeout: %v", err)
|
||||
}
|
||||
```
|
||||
|
||||
**Shutdown process:**
|
||||
1. Closes channel (rejects new jobs)
|
||||
2. Waits for pending jobs (up to timeout)
|
||||
3. Cancels context if timeout exceeded
|
||||
4. Waits for worker to exit cleanly
|
||||
|
||||
## Configuration
|
||||
|
||||
### Channel Buffer Size
|
||||
|
||||
The archiving channel has a buffer of 128 jobs. If more than 128 jobs are queued simultaneously, `TriggerArchiving()` will block until space is available.
|
||||
|
||||
To adjust:
|
||||
```go
|
||||
// In archiveWorker.go Start() function
|
||||
archiveChannel = make(chan *schema.Job, 256) // Increase buffer
|
||||
```
|
||||
|
||||
### Scope Selection
|
||||
|
||||
Archive data scopes are automatically selected based on job size:
|
||||
|
||||
- **Node scope**: Always included
|
||||
- **Core scope**: Included for jobs with ≤8 nodes (reduces data volume for large jobs)
|
||||
- **Accelerator scope**: Included if job used accelerators (`NumAcc > 0`)
|
||||
|
||||
To adjust the node threshold:
|
||||
```go
|
||||
// In archiver.go ArchiveJob() function
|
||||
if job.NumNodes <= 16 { // Change from 8 to 16
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
```
|
||||
|
||||
### Resolution
|
||||
|
||||
Data is archived at the highest available resolution (typically 60s intervals). To change:
|
||||
|
||||
```go
|
||||
// In archiver.go ArchiveJob() function
|
||||
jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
|
||||
// 0 = highest resolution
|
||||
// 300 = 5-minute resolution
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Automatic Retry
|
||||
|
||||
The archiver does **not** automatically retry failed archiving operations. If archiving fails:
|
||||
|
||||
1. Error is logged
|
||||
2. Job is marked as `MonitoringStatusArchivingFailed` in database
|
||||
3. Worker continues processing other jobs
|
||||
|
||||
### Manual Retry
|
||||
|
||||
To re-archive failed jobs, query for jobs with `MonitoringStatusArchivingFailed` and call `TriggerArchiving()` again.
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Single Worker Thread
|
||||
|
||||
The archiver uses a single worker goroutine. For high-throughput systems:
|
||||
|
||||
- Large channel buffer (128) prevents blocking
|
||||
- Archiving is typically I/O bound (writing to storage)
|
||||
- Single worker prevents overwhelming storage backend
|
||||
|
||||
### Shutdown Timeout
|
||||
|
||||
Recommended timeout values:
|
||||
- **Development**: 5-10 seconds
|
||||
- **Production**: 10-30 seconds
|
||||
- **High-load**: 30-60 seconds
|
||||
|
||||
Choose based on:
|
||||
- Average archiving time per job
|
||||
- Storage backend latency
|
||||
- Acceptable shutdown delay
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Logging
|
||||
|
||||
The archiver logs:
|
||||
- **Info**: Startup, shutdown, successful completions
|
||||
- **Debug**: Individual job archiving times
|
||||
- **Error**: Archiving failures with job ID and reason
|
||||
- **Warn**: Shutdown timeout exceeded
|
||||
|
||||
### Metrics
|
||||
|
||||
Monitor these signals for archiver health:
|
||||
- Jobs with `MonitoringStatusArchivingFailed`
|
||||
- Time from job stop to successful archive
|
||||
- Shutdown timeout occurrences
|
||||
|
||||
## Thread Safety
|
||||
|
||||
All exported functions are safe for concurrent use:
|
||||
- `Start()` - Safe to call once
|
||||
- `TriggerArchiving()` - Safe from multiple goroutines
|
||||
- `Shutdown()` - Safe to call once
|
||||
- `WaitForArchiving()` - Deprecated, but safe
|
||||
|
||||
Internal state is protected by:
|
||||
- Channel synchronization (`archiveChannel`)
|
||||
- WaitGroup for pending count (`archivePending`)
|
||||
- Context for cancellation (`shutdownCtx`)
|
||||
|
||||
## Files
|
||||
|
||||
- **archiveWorker.go**: Worker lifecycle, channel management, shutdown logic
|
||||
- **archiver.go**: Core archiving logic, metric loading, statistics calculation
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `internal/repository`: Database operations for job metadata
|
||||
- `internal/metricdispatcher`: Loading metric data from various backends
|
||||
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
|
||||
- `cc-lib/schema`: Job and metric data structures
|
||||
250
internal/archiver/archiveWorker.go
Normal file
250
internal/archiver/archiveWorker.go
Normal file
@@ -0,0 +1,250 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package archiver provides asynchronous job archiving functionality for ClusterCockpit.
|
||||
//
|
||||
// The archiver runs a background worker goroutine that processes job archiving requests
|
||||
// from a buffered channel. When jobs complete, their metric data is archived from the
|
||||
// metric store to the configured archive backend (filesystem, S3, etc.).
|
||||
//
|
||||
// # Architecture
|
||||
//
|
||||
// The archiver uses a producer-consumer pattern:
|
||||
// - Producer: TriggerArchiving() sends jobs to archiveChannel
|
||||
// - Consumer: archivingWorker() processes jobs from the channel
|
||||
// - Coordination: sync.WaitGroup tracks pending archive operations
|
||||
//
|
||||
// # Lifecycle
|
||||
//
|
||||
// 1. Start(repo, ctx) - Initialize worker with context for cancellation
|
||||
// 2. TriggerArchiving(job) - Queue job for archiving (called when job stops)
|
||||
// 3. archivingWorker() - Background goroutine processes jobs
|
||||
// 4. Shutdown(timeout) - Graceful shutdown with timeout
|
||||
//
|
||||
// # Graceful Shutdown
|
||||
//
|
||||
// The archiver supports graceful shutdown with configurable timeout:
|
||||
// - Closes channel to reject new jobs
|
||||
// - Waits for pending jobs to complete (up to timeout)
|
||||
// - Cancels context if timeout exceeded
|
||||
// - Ensures worker goroutine exits cleanly
|
||||
//
|
||||
// # Example Usage
|
||||
//
|
||||
// // Initialize archiver
|
||||
// ctx, cancel := context.WithCancel(context.Background())
|
||||
// defer cancel()
|
||||
// archiver.Start(jobRepository, ctx)
|
||||
//
|
||||
// // Trigger archiving when job completes
|
||||
// archiver.TriggerArchiving(job)
|
||||
//
|
||||
// // Graceful shutdown with 10 second timeout
|
||||
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
// log.Printf("Archiver shutdown timeout: %v", err)
|
||||
// }
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
var (
|
||||
archivePending sync.WaitGroup
|
||||
archiveChannel chan *schema.Job
|
||||
jobRepo *repository.JobRepository
|
||||
shutdownCtx context.Context
|
||||
shutdownCancel context.CancelFunc
|
||||
workerDone chan struct{}
|
||||
)
|
||||
|
||||
// Start initializes the archiver and starts the background worker goroutine.
|
||||
//
|
||||
// The archiver processes job archiving requests asynchronously via a buffered channel.
|
||||
// Jobs are sent to the channel using TriggerArchiving() and processed by the worker.
|
||||
//
|
||||
// Parameters:
|
||||
// - r: JobRepository instance for database operations
|
||||
// - ctx: Context for cancellation (shutdown signal propagation)
|
||||
//
|
||||
// The worker goroutine will run until:
|
||||
// - ctx is cancelled (via parent shutdown)
|
||||
// - archiveChannel is closed (via Shutdown())
|
||||
//
|
||||
// Must be called before TriggerArchiving(). Safe to call only once.
|
||||
func Start(r *repository.JobRepository, ctx context.Context) {
|
||||
shutdownCtx, shutdownCancel = context.WithCancel(ctx)
|
||||
archiveChannel = make(chan *schema.Job, 128)
|
||||
workerDone = make(chan struct{})
|
||||
jobRepo = r
|
||||
|
||||
go archivingWorker()
|
||||
}
|
||||
|
||||
// archivingWorker is the background goroutine that processes job archiving requests.
|
||||
//
|
||||
// The worker loop:
|
||||
// 1. Blocks waiting for jobs on archiveChannel or shutdown signal
|
||||
// 2. Fetches job metadata from repository
|
||||
// 3. Archives job data to configured backend (calls ArchiveJob)
|
||||
// 4. Updates job footprint and energy metrics in database
|
||||
// 5. Marks job as successfully archived
|
||||
// 6. Calls job stop hooks
|
||||
//
|
||||
// The worker exits when:
|
||||
// - shutdownCtx is cancelled (timeout during shutdown)
|
||||
// - archiveChannel is closed (normal shutdown)
|
||||
//
|
||||
// Errors during archiving are logged and the job is marked as failed,
|
||||
// but the worker continues processing other jobs.
|
||||
func archivingWorker() {
|
||||
defer close(workerDone)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-shutdownCtx.Done():
|
||||
cclog.Info("Archive worker received shutdown signal")
|
||||
return
|
||||
|
||||
case job, ok := <-archiveChannel:
|
||||
if !ok {
|
||||
cclog.Info("Archive channel closed, worker exiting")
|
||||
return
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
|
||||
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// Use shutdown context to allow cancellation
|
||||
jobMeta, err := ArchiveJob(job, shutdownCtx)
|
||||
if err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
|
||||
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
||||
|
||||
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
// Update the jobs database entry one last time:
|
||||
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
||||
if err := jobRepo.Execute(stmt); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
|
||||
|
||||
repository.CallJobStopHooks(job)
|
||||
archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TriggerArchiving queues a job for asynchronous archiving.
|
||||
//
|
||||
// This function should be called when a job completes (stops) to archive its
|
||||
// metric data from the metric store to the configured archive backend.
|
||||
//
|
||||
// The function:
|
||||
// 1. Increments the pending job counter (WaitGroup)
|
||||
// 2. Sends the job to the archiving channel (buffered, capacity 128)
|
||||
// 3. Returns immediately (non-blocking unless channel is full)
|
||||
//
|
||||
// The actual archiving is performed asynchronously by the worker goroutine.
|
||||
// Upon completion, the worker will decrement the pending counter.
|
||||
//
|
||||
// Panics if Start() has not been called first.
|
||||
func TriggerArchiving(job *schema.Job) {
|
||||
if archiveChannel == nil {
|
||||
cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
|
||||
}
|
||||
|
||||
archivePending.Add(1)
|
||||
archiveChannel <- job
|
||||
}
|
||||
|
||||
// Shutdown performs a graceful shutdown of the archiver with a configurable timeout.
|
||||
//
|
||||
// The shutdown process:
|
||||
// 1. Closes archiveChannel - no new jobs will be accepted
|
||||
// 2. Waits for pending jobs to complete (up to timeout duration)
|
||||
// 3. If timeout is exceeded:
|
||||
// - Cancels shutdownCtx to interrupt ongoing ArchiveJob operations
|
||||
// - Returns error indicating timeout
|
||||
// 4. Waits for worker goroutine to exit cleanly
|
||||
//
|
||||
// Parameters:
|
||||
// - timeout: Maximum duration to wait for pending jobs to complete
|
||||
// (recommended: 10-30 seconds for production)
|
||||
//
|
||||
// Returns:
|
||||
// - nil if all jobs completed within timeout
|
||||
// - error if timeout was exceeded (some jobs may not have been archived)
|
||||
//
|
||||
// Jobs that don't complete within the timeout will be marked as failed.
|
||||
// The function always ensures the worker goroutine exits before returning.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
// log.Printf("Some jobs did not complete: %v", err)
|
||||
// }
|
||||
func Shutdown(timeout time.Duration) error {
|
||||
cclog.Info("Initiating archiver shutdown...")
|
||||
|
||||
// Close channel to signal no more jobs will be accepted
|
||||
close(archiveChannel)
|
||||
|
||||
// Create a channel to signal when all jobs are done
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
archivePending.Wait()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
// Wait for jobs to complete or timeout
|
||||
select {
|
||||
case <-done:
|
||||
cclog.Info("All archive jobs completed successfully")
|
||||
// Wait for worker to exit
|
||||
<-workerDone
|
||||
return nil
|
||||
case <-time.After(timeout):
|
||||
cclog.Warn("Archiver shutdown timeout exceeded, cancelling remaining operations")
|
||||
// Cancel any ongoing operations
|
||||
shutdownCancel()
|
||||
// Wait for worker to exit
|
||||
<-workerDone
|
||||
return fmt.Errorf("archiver shutdown timeout after %v", timeout)
|
||||
}
|
||||
}
|
||||
105
internal/archiver/archiver.go
Normal file
105
internal/archiver/archiver.go
Normal file
@@ -0,0 +1,105 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// ArchiveJob archives a completed job's metric data to the configured archive backend.
|
||||
//
|
||||
// This function performs the following operations:
|
||||
// 1. Loads all metric data for the job from the metric data repository
|
||||
// 2. Calculates job-level statistics (avg, min, max) for each metric
|
||||
// 3. Stores the job metadata and metric data to the archive backend
|
||||
//
|
||||
// Metric data is retrieved at the highest available resolution (typically 60s)
|
||||
// for the following scopes:
|
||||
// - Node scope (always)
|
||||
// - Core scope (for jobs with ≤8 nodes, to reduce data volume)
|
||||
// - Accelerator scope (if job used accelerators)
|
||||
//
|
||||
// The function respects context cancellation. If ctx is cancelled (e.g., during
|
||||
// shutdown timeout), the operation will be interrupted and return an error.
|
||||
//
|
||||
// Parameters:
|
||||
// - job: The job to archive (must be a completed job)
|
||||
// - ctx: Context for cancellation and timeout control
|
||||
//
|
||||
// Returns:
|
||||
// - *schema.Job with populated Statistics field
|
||||
// - error if data loading or archiving fails
|
||||
//
|
||||
// If config.Keys.DisableArchive is true, only job statistics are calculated
|
||||
// and returned (no data is written to archive backend).
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
// FIXME: Add a config option for this
|
||||
if job.NumNodes <= 8 {
|
||||
// This will add the native scope if core scope is not available
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
if job.NumAcc > 0 {
|
||||
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||
}
|
||||
|
||||
jobData, err := metricdispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||
if err != nil {
|
||||
cclog.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job.Statistics = make(map[string]schema.JobStatistics)
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// This should never happen ?
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
// Round AVG Result to 2 Digits
|
||||
job.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if config.Keys.DisableArchive {
|
||||
return job, nil
|
||||
}
|
||||
|
||||
return job, archive.GetHandle().ImportJob(job, &jobData)
|
||||
}
|
||||
@@ -1,31 +1,121 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package auth implements various authentication methods
|
||||
package auth
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"database/sql"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
"github.com/gorilla/sessions"
|
||||
)
|
||||
|
||||
// Authenticator is the interface for all authentication methods.
|
||||
// Each authenticator determines if it can handle a login request (CanLogin)
|
||||
// and performs the actual authentication (Login).
|
||||
type Authenticator interface {
|
||||
// CanLogin determines if this authenticator can handle the login request.
|
||||
// It returns the user object if available and a boolean indicating if this
|
||||
// authenticator should attempt the login. This method should not perform
|
||||
// expensive operations or actual authentication.
|
||||
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
|
||||
|
||||
// Login performs the actually authentication for the user.
|
||||
// It returns the authenticated user or an error if authentication fails.
|
||||
// The user parameter may be nil if the user doesn't exist in the database yet.
|
||||
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
|
||||
}
|
||||
|
||||
var (
|
||||
initOnce sync.Once
|
||||
authInstance *Authentication
|
||||
)
|
||||
|
||||
// rateLimiterEntry tracks a rate limiter and its last use time for cleanup
|
||||
type rateLimiterEntry struct {
|
||||
limiter *rate.Limiter
|
||||
lastUsed time.Time
|
||||
}
|
||||
|
||||
var ipUserLimiters sync.Map
|
||||
|
||||
// getIPUserLimiter returns a rate limiter for the given IP and username combination.
|
||||
// Rate limiters are created on demand and track 5 attempts per 15 minutes.
|
||||
func getIPUserLimiter(ip, username string) *rate.Limiter {
|
||||
key := ip + ":" + username
|
||||
now := time.Now()
|
||||
|
||||
if entry, ok := ipUserLimiters.Load(key); ok {
|
||||
rle := entry.(*rateLimiterEntry)
|
||||
rle.lastUsed = now
|
||||
return rle.limiter
|
||||
}
|
||||
|
||||
// More aggressive rate limiting: 5 attempts per 15 minutes
|
||||
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
|
||||
ipUserLimiters.Store(key, &rateLimiterEntry{
|
||||
limiter: newLimiter,
|
||||
lastUsed: now,
|
||||
})
|
||||
return newLimiter
|
||||
}
|
||||
|
||||
// cleanupOldRateLimiters removes rate limiters that haven't been used recently
|
||||
func cleanupOldRateLimiters(olderThan time.Time) {
|
||||
ipUserLimiters.Range(func(key, value any) bool {
|
||||
entry := value.(*rateLimiterEntry)
|
||||
if entry.lastUsed.Before(olderThan) {
|
||||
ipUserLimiters.Delete(key)
|
||||
cclog.Debugf("Cleaned up rate limiter for %v", key)
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
// startRateLimiterCleanup starts a background goroutine to clean up old rate limiters
|
||||
func startRateLimiterCleanup() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Hour)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
// Clean up limiters not used in the last 24 hours
|
||||
cleanupOldRateLimiters(time.Now().Add(-24 * time.Hour))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// AuthConfig contains configuration for all authentication methods
|
||||
type AuthConfig struct {
|
||||
LdapConfig *LdapConfig `json:"ldap"`
|
||||
JwtConfig *JWTAuthConfig `json:"jwts"`
|
||||
OpenIDConfig *OpenIDConfig `json:"oidc"`
|
||||
}
|
||||
|
||||
// Keys holds the global authentication configuration
|
||||
var Keys AuthConfig
|
||||
|
||||
// Authentication manages all authentication methods and session handling
|
||||
type Authentication struct {
|
||||
sessionStore *sessions.CookieStore
|
||||
LdapAuth *LdapAuthenticator
|
||||
@@ -41,7 +131,7 @@ func (auth *Authentication) AuthViaSession(
|
||||
) (*schema.User, error) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
if err != nil {
|
||||
log.Error("Error while getting session store")
|
||||
cclog.Error("Error while getting session store")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -49,10 +139,31 @@ func (auth *Authentication) AuthViaSession(
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// TODO: Check if session keys exist
|
||||
username, _ := session.Values["username"].(string)
|
||||
projects, _ := session.Values["projects"].([]string)
|
||||
roles, _ := session.Values["roles"].([]string)
|
||||
// Validate session data with proper type checking
|
||||
username, ok := session.Values["username"].(string)
|
||||
if !ok || username == "" {
|
||||
cclog.Warn("Invalid session: missing or invalid username")
|
||||
// Invalidate the corrupted session
|
||||
session.Options.MaxAge = -1
|
||||
_ = auth.sessionStore.Save(r, rw, session)
|
||||
return nil, errors.New("invalid session data")
|
||||
}
|
||||
|
||||
projects, ok := session.Values["projects"].([]string)
|
||||
if !ok {
|
||||
cclog.Warn("Invalid session: projects not found or invalid type, using empty list")
|
||||
projects = []string{}
|
||||
}
|
||||
|
||||
roles, ok := session.Values["roles"].([]string)
|
||||
if !ok || len(roles) == 0 {
|
||||
cclog.Warn("Invalid session: missing or invalid roles")
|
||||
// Invalidate the corrupted session
|
||||
session.Options.MaxAge = -1
|
||||
_ = auth.sessionStore.Save(r, rw, session)
|
||||
return nil, errors.New("invalid session data")
|
||||
}
|
||||
|
||||
return &schema.User{
|
||||
Username: username,
|
||||
Projects: projects,
|
||||
@@ -62,90 +173,131 @@ func (auth *Authentication) AuthViaSession(
|
||||
}, nil
|
||||
}
|
||||
|
||||
func Init() (*Authentication, error) {
|
||||
auth := &Authentication{}
|
||||
func Init(authCfg *json.RawMessage) {
|
||||
initOnce.Do(func() {
|
||||
authInstance = &Authentication{}
|
||||
|
||||
// Start background cleanup of rate limiters
|
||||
startRateLimiterCleanup()
|
||||
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
log.Error("Error while initializing authentication -> decoding session key failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
if config.Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
log.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
cclog.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
auth.LdapAuth = ldapAuth
|
||||
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig != nil {
|
||||
auth.JwtAuth = &JWTAuthenticator{}
|
||||
if err := auth.JwtAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> jwtAuth init failed")
|
||||
return nil, err
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> decoding session key failed")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err == nil {
|
||||
authInstance.SessionMaxAge = d
|
||||
}
|
||||
|
||||
if authCfg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
config.Validate(configSchema, *authCfg)
|
||||
dec := json.NewDecoder(bytes.NewReader(*authCfg))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Errorf("error while decoding ldap config: %v", err)
|
||||
}
|
||||
|
||||
if Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
cclog.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
} else {
|
||||
authInstance.LdapAuth = ldapAuth
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
|
||||
cclog.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
if Keys.JwtConfig != nil {
|
||||
authInstance.JwtAuth = &JWTAuthenticator{}
|
||||
if err := authInstance.JwtAuth.Init(); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> jwtAuth init failed")
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
cclog.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
cclog.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
|
||||
cclog.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
|
||||
auth.LocalAuth = &LocalAuthenticator{}
|
||||
if err := auth.LocalAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> localAuth init failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
|
||||
|
||||
return auth, nil
|
||||
authInstance.LocalAuth = &LocalAuthenticator{}
|
||||
if err := authInstance.LocalAuth.Init(); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> localAuth init failed")
|
||||
}
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
|
||||
})
|
||||
}
|
||||
|
||||
func persistUser(user *schema.User) {
|
||||
func GetAuthInstance() *Authentication {
|
||||
if authInstance == nil {
|
||||
cclog.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return authInstance
|
||||
}
|
||||
|
||||
// handleUserSync syncs or updates a user in the database based on configuration.
|
||||
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
|
||||
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
|
||||
r := repository.GetUserRepository()
|
||||
_, err := r.GetUser(user.Username)
|
||||
dbUser, err := r.GetUser(user.Username)
|
||||
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||
} else if err == sql.ErrNoRows {
|
||||
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
|
||||
if err := r.AddUser(user); err != nil {
|
||||
log.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||
}
|
||||
} else if err == nil && updateUserOnLogin { // Update existing user
|
||||
if err := r.UpdateUser(dbUser, user); err != nil {
|
||||
cclog.Errorf("Error while updating user '%s' in DB: %v", dbUser.Username, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleTokenUser syncs JWT token user with database
|
||||
func handleTokenUser(tokenUser *schema.User) {
|
||||
handleUserSync(tokenUser, Keys.JwtConfig.SyncUserOnLogin, Keys.JwtConfig.UpdateUserOnLogin)
|
||||
}
|
||||
|
||||
// handleOIDCUser syncs OIDC user with database
|
||||
func handleOIDCUser(OIDCUser *schema.User) {
|
||||
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
|
||||
}
|
||||
|
||||
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
|
||||
session, err := auth.sessionStore.New(r, "session")
|
||||
if err != nil {
|
||||
log.Errorf("session creation failed: %s", err.Error())
|
||||
cclog.Errorf("session creation failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return err
|
||||
}
|
||||
@@ -153,11 +305,16 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
if auth.SessionMaxAge != 0 {
|
||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||
}
|
||||
if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
|
||||
cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
|
||||
session.Options.Secure = false
|
||||
}
|
||||
session.Options.SameSite = http.SameSiteStrictMode
|
||||
session.Values["username"] = user.Username
|
||||
session.Values["projects"] = user.Projects
|
||||
session.Values["roles"] = user.Roles
|
||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||
log.Warnf("session save failed: %s", err.Error())
|
||||
cclog.Warnf("session save failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return err
|
||||
}
|
||||
@@ -166,18 +323,29 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
}
|
||||
|
||||
func (auth *Authentication) Login(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
username := r.FormValue("username")
|
||||
var dbUser *schema.User
|
||||
ip, _, err := net.SplitHostPort(r.RemoteAddr)
|
||||
if err != nil {
|
||||
ip = r.RemoteAddr
|
||||
}
|
||||
|
||||
username := r.FormValue("username")
|
||||
|
||||
limiter := getIPUserLimiter(ip, username)
|
||||
if !limiter.Allow() {
|
||||
cclog.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
|
||||
onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
|
||||
return
|
||||
}
|
||||
|
||||
var dbUser *schema.User
|
||||
if username != "" {
|
||||
var err error
|
||||
dbUser, err = repository.GetUserRepository().GetUser(username)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", username)
|
||||
cclog.Errorf("Error while loading user '%v'", username)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,12 +355,12 @@ func (auth *Authentication) Login(
|
||||
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
|
||||
continue
|
||||
} else {
|
||||
log.Debugf("Can login with user %v", user)
|
||||
cclog.Debugf("Can login with user %v", user)
|
||||
}
|
||||
|
||||
user, err := authenticator.Login(user, rw, r)
|
||||
if err != nil {
|
||||
log.Warnf("user login failed: %s", err.Error())
|
||||
cclog.Warnf("user login failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
@@ -201,13 +369,19 @@ func (auth *Authentication) Login(
|
||||
return
|
||||
}
|
||||
|
||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
|
||||
if r.FormValue("redirect") != "" {
|
||||
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
log.Debugf("login failed: no authenticator applied")
|
||||
cclog.Debugf("login failed: no authenticator applied")
|
||||
onfailure(rw, r, errors.New("no authenticator applied"))
|
||||
})
|
||||
}
|
||||
@@ -219,31 +393,186 @@ func (auth *Authentication) Auth(
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
if user == nil {
|
||||
user, err = auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug("authentication failed")
|
||||
cclog.Info("auth -> authentication failed")
|
||||
onfailure(rw, r, errors.New("unauthorized (please login first)"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
ipErr := securedCheck(user, r)
|
||||
if ipErr != nil {
|
||||
cclog.Infof("auth api -> secured check failed: %s", ipErr.Error())
|
||||
onfailure(rw, r, ipErr)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthUserAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth user api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth user api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth user api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthMetricStoreAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth metricstore api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth metricstore api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthConfigAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth config api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil && user.HasRole(schema.RoleAdmin) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
cclog.Info("auth config api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthFrontendAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth frontend api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
cclog.Info("auth frontend api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
@@ -263,3 +592,42 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
onsuccess.ServeHTTP(rw, r)
|
||||
})
|
||||
}
|
||||
|
||||
// Helper Moved To MiddleWare Auth Handlers
|
||||
func securedCheck(user *schema.User, r *http.Request) error {
|
||||
if user == nil {
|
||||
return fmt.Errorf("no user for secured check")
|
||||
}
|
||||
|
||||
// extract IP address for checking
|
||||
IPAddress := r.Header.Get("X-Real-Ip")
|
||||
if IPAddress == "" {
|
||||
IPAddress = r.Header.Get("X-Forwarded-For")
|
||||
}
|
||||
if IPAddress == "" {
|
||||
IPAddress = r.RemoteAddr
|
||||
}
|
||||
|
||||
// Handle both IPv4 and IPv6 addresses properly
|
||||
// For IPv6, this will strip the port and brackets
|
||||
// For IPv4, this will strip the port
|
||||
if host, _, err := net.SplitHostPort(IPAddress); err == nil {
|
||||
IPAddress = host
|
||||
}
|
||||
// If SplitHostPort fails, IPAddress is already just a host (no port)
|
||||
|
||||
// If nothing declared in config: deny all request to this api endpoint
|
||||
if len(config.Keys.APIAllowedIPs) == 0 {
|
||||
return fmt.Errorf("missing configuration key ApiAllowedIPs")
|
||||
}
|
||||
// If wildcard declared in config: Continue
|
||||
if config.Keys.APIAllowedIPs[0] == "*" {
|
||||
return nil
|
||||
}
|
||||
// check if IP is allowed
|
||||
if !util.Contains(config.Keys.APIAllowedIPs, IPAddress) {
|
||||
return fmt.Errorf("unknown ip: %v", IPAddress)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
176
internal/auth/auth_test.go
Normal file
176
internal/auth/auth_test.go
Normal file
@@ -0,0 +1,176 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestGetIPUserLimiter tests the rate limiter creation and retrieval
|
||||
func TestGetIPUserLimiter(t *testing.T) {
|
||||
ip := "192.168.1.1"
|
||||
username := "testuser"
|
||||
|
||||
// Get limiter for the first time
|
||||
limiter1 := getIPUserLimiter(ip, username)
|
||||
if limiter1 == nil {
|
||||
t.Fatal("Expected limiter to be created")
|
||||
}
|
||||
|
||||
// Get the same limiter again
|
||||
limiter2 := getIPUserLimiter(ip, username)
|
||||
if limiter1 != limiter2 {
|
||||
t.Error("Expected to get the same limiter instance")
|
||||
}
|
||||
|
||||
// Get a different limiter for different user
|
||||
limiter3 := getIPUserLimiter(ip, "otheruser")
|
||||
if limiter1 == limiter3 {
|
||||
t.Error("Expected different limiter for different user")
|
||||
}
|
||||
|
||||
// Get a different limiter for different IP
|
||||
limiter4 := getIPUserLimiter("192.168.1.2", username)
|
||||
if limiter1 == limiter4 {
|
||||
t.Error("Expected different limiter for different IP")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRateLimiterBehavior tests that rate limiting works correctly
|
||||
func TestRateLimiterBehavior(t *testing.T) {
|
||||
ip := "10.0.0.1"
|
||||
username := "ratelimituser"
|
||||
|
||||
limiter := getIPUserLimiter(ip, username)
|
||||
|
||||
// Should allow first 5 attempts
|
||||
for i := 0; i < 5; i++ {
|
||||
if !limiter.Allow() {
|
||||
t.Errorf("Request %d should be allowed within rate limit", i+1)
|
||||
}
|
||||
}
|
||||
|
||||
// 6th attempt should be blocked
|
||||
if limiter.Allow() {
|
||||
t.Error("Request 6 should be blocked by rate limiter")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCleanupOldRateLimiters tests the cleanup function
|
||||
func TestCleanupOldRateLimiters(t *testing.T) {
|
||||
// Clear all existing limiters first to avoid interference from other tests
|
||||
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
|
||||
|
||||
// Create some new rate limiters
|
||||
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
|
||||
|
||||
if limiter1 == nil || limiter2 == nil {
|
||||
t.Fatal("Failed to create test limiters")
|
||||
}
|
||||
|
||||
// Cleanup limiters older than 1 second from now (should keep both)
|
||||
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
|
||||
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
|
||||
|
||||
// Verify they still exist (should get same instance)
|
||||
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
|
||||
t.Error("Limiter 1 was incorrectly cleaned up")
|
||||
}
|
||||
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
|
||||
t.Error("Limiter 2 was incorrectly cleaned up")
|
||||
}
|
||||
|
||||
// Cleanup limiters older than 1 hour from now (should remove both)
|
||||
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
|
||||
|
||||
// Getting them again should create new instances
|
||||
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||
if newLimiter1 == limiter1 {
|
||||
t.Error("Old limiter should have been cleaned up")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIPv4Extraction tests extracting IPv4 addresses
|
||||
func TestIPv4Extraction(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"IPv4 with port", "192.168.1.1:8080", "192.168.1.1"},
|
||||
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
|
||||
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIPv6Extraction tests extracting IPv6 addresses
|
||||
func TestIPv6Extraction(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"IPv6 with port", "[2001:db8::1]:8080", "2001:db8::1"},
|
||||
{"IPv6 localhost with port", "[::1]:3000", "::1"},
|
||||
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
|
||||
{"IPv6 localhost", "::1", "::1"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIPExtractionEdgeCases tests edge cases for IP extraction
|
||||
func TestIPExtractionEdgeCases(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"Hostname without port", "example.com", "example.com"},
|
||||
{"Empty string", "", ""},
|
||||
{"Just port", ":8080", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,13 +14,33 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
type JWTAuthConfig struct {
|
||||
// Specifies for how long a JWT token shall be valid
|
||||
// as a string parsable by time.ParseDuration().
|
||||
MaxAge string `json:"max-age"`
|
||||
|
||||
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
|
||||
CookieName string `json:"cookieName"`
|
||||
|
||||
// Deny login for users not in database (but defined in JWT).
|
||||
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
|
||||
ValidateUser bool `json:"validateUser"`
|
||||
|
||||
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
|
||||
TrustedIssuer string `json:"trustedIssuer"`
|
||||
|
||||
// Should an non-existent user be added to the DB based on the information in the token
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
|
||||
// Should an existent user be updated in the DB based on the information in the token
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type JWTAuthenticator struct {
|
||||
publicKey ed25519.PublicKey
|
||||
privateKey ed25519.PrivateKey
|
||||
@@ -28,17 +49,17 @@ type JWTAuthenticator struct {
|
||||
func (ja *JWTAuthenticator) Init() error {
|
||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||
if pubKey == "" || privKey == "" {
|
||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT public key")
|
||||
cclog.Warn("Could not decode JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKey = ed25519.PublicKey(bytes)
|
||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT private key")
|
||||
cclog.Warn("Could not decode JWT private key")
|
||||
return err
|
||||
}
|
||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||
@@ -62,7 +83,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method != jwt.SigningMethodEdDSA {
|
||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||
}
|
||||
@@ -70,51 +91,34 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
return ja.publicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing JWT token")
|
||||
cclog.Warn("Error while parsing JWT token")
|
||||
return nil, err
|
||||
}
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
// Token is valid, extract payload
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
sub, _ := claims["sub"].(string)
|
||||
|
||||
var roles []string
|
||||
|
||||
// Validate user + roles from JWT against database?
|
||||
if config.Keys.JwtConfig.ValidateUser {
|
||||
ur := repository.GetUserRepository()
|
||||
user, err := ur.GetUser(sub)
|
||||
// Deny any logins for unknown usernames
|
||||
if err != nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
// Take user roles from database instead of trusting the JWT
|
||||
roles = user.Roles
|
||||
} else {
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
var user *schema.User
|
||||
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &schema.User{
|
||||
Username: sub,
|
||||
Roles: roles,
|
||||
AuthType: schema.AuthToken,
|
||||
AuthSource: -1,
|
||||
}, nil
|
||||
|
||||
// If not validating user, we only get roles from JWT (no projects for this auth method)
|
||||
if !Keys.JwtConfig.ValidateUser {
|
||||
user.Roles = extractRolesFromClaims(claims, false)
|
||||
user.Projects = nil // Standard JWT auth doesn't include projects
|
||||
}
|
||||
|
||||
return user, nil
|
||||
}
|
||||
|
||||
// Generate a new JWT that can be used for authentication
|
||||
// ProvideJWT generates a new JWT that can be used for authentication
|
||||
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
||||
if ja.privateKey == nil {
|
||||
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
|
||||
@@ -126,8 +130,8 @@ func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
||||
"roles": user.Roles,
|
||||
"iat": now.Unix(),
|
||||
}
|
||||
if config.Keys.JwtConfig.MaxAge != "" {
|
||||
d, err := time.ParseDuration(config.Keys.JwtConfig.MaxAge)
|
||||
if Keys.JwtConfig.MaxAge != "" {
|
||||
d, err := time.ParseDuration(Keys.JwtConfig.MaxAge)
|
||||
if err != nil {
|
||||
return "", errors.New("cannot parse max-age config key")
|
||||
}
|
||||
|
||||
@@ -1,22 +1,19 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"crypto/ed25519"
|
||||
"database/sql"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -31,18 +28,18 @@ var _ Authenticator = (*JWTCookieSessionAuthenticator)(nil)
|
||||
func (ja *JWTCookieSessionAuthenticator) Init() error {
|
||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||
if pubKey == "" || privKey == "" {
|
||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT public key")
|
||||
cclog.Warn("Could not decode JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKey = ed25519.PublicKey(bytes)
|
||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT private key")
|
||||
cclog.Warn("Could not decode JWT private key")
|
||||
return err
|
||||
}
|
||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||
@@ -53,36 +50,35 @@ func (ja *JWTCookieSessionAuthenticator) Init() error {
|
||||
if keyFound && pubKeyCrossLogin != "" {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode cross login JWT public key")
|
||||
cclog.Warn("Could not decode cross login JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
|
||||
} else {
|
||||
ja.publicKeyCrossLogin = nil
|
||||
log.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
cclog.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
}
|
||||
|
||||
jc := config.Keys.JwtConfig
|
||||
// Warn if other necessary settings are not configured
|
||||
if jc != nil {
|
||||
if jc.CookieName == "" {
|
||||
log.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
if Keys.JwtConfig != nil {
|
||||
if Keys.JwtConfig.CookieName == "" {
|
||||
cclog.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
if !jc.ValidateUser {
|
||||
log.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
||||
if !Keys.JwtConfig.ValidateUser {
|
||||
cclog.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
||||
}
|
||||
if jc.TrustedIssuer == "" {
|
||||
log.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
if Keys.JwtConfig.TrustedIssuer == "" {
|
||||
cclog.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
} else {
|
||||
log.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
cclog.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
|
||||
log.Info("JWT Cookie Session authenticator successfully registered")
|
||||
cclog.Info("JWT Cookie Session authenticator successfully registered")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -92,7 +88,7 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
jc := config.Keys.JwtConfig
|
||||
jc := Keys.JwtConfig
|
||||
cookieName := ""
|
||||
if jc.CookieName != "" {
|
||||
cookieName = jc.CookieName
|
||||
@@ -115,7 +111,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, error) {
|
||||
jc := config.Keys.JwtConfig
|
||||
jc := Keys.JwtConfig
|
||||
jwtCookie, err := r.Cookie(jc.CookieName)
|
||||
var rawtoken string
|
||||
|
||||
@@ -123,7 +119,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
rawtoken = jwtCookie.Value
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method != jwt.SigningMethodEdDSA {
|
||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||
}
|
||||
@@ -140,67 +136,26 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
return ja.publicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("JWT cookie session: error while parsing token")
|
||||
cclog.Warn("JWT cookie session: error while parsing token")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
sub, _ := claims["sub"].(string)
|
||||
|
||||
var roles []string
|
||||
projects := make([]string, 0)
|
||||
|
||||
if jc.ValidateUser {
|
||||
var err error
|
||||
user, err = repository.GetUserRepository().GetUser(sub)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", sub)
|
||||
}
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
} else {
|
||||
var name string
|
||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
||||
if len(vals) != 0 {
|
||||
name = fmt.Sprintf("%v", vals[0])
|
||||
|
||||
for i := 1; i < len(vals); i++ {
|
||||
name += fmt.Sprintf(" %v", vals[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
user = &schema.User{
|
||||
Username: sub,
|
||||
Name: name,
|
||||
Roles: roles,
|
||||
Projects: projects,
|
||||
AuthType: schema.AuthSession,
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if jc.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
}
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Sync or update user if configured
|
||||
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
|
||||
// (Ask browser to) Delete JWT cookie
|
||||
|
||||
136
internal/auth/jwtHelpers.go
Normal file
136
internal/auth/jwtHelpers.go
Normal file
@@ -0,0 +1,136 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
// extractStringFromClaims extracts a string value from JWT claims
|
||||
func extractStringFromClaims(claims jwt.MapClaims, key string) string {
|
||||
if val, ok := claims[key].(string); ok {
|
||||
return val
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractRolesFromClaims extracts roles from JWT claims
|
||||
// If validateRoles is true, only valid roles are returned
|
||||
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||
var roles []string
|
||||
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
if validateRoles {
|
||||
if schema.IsValidRole(r) {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
} else {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return roles
|
||||
}
|
||||
|
||||
// extractProjectsFromClaims extracts projects from JWT claims
|
||||
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||
projects := make([]string, 0)
|
||||
|
||||
if rawprojs, ok := claims["projects"].([]any); ok {
|
||||
for _, pp := range rawprojs {
|
||||
if p, ok := pp.(string); ok {
|
||||
projects = append(projects, p)
|
||||
}
|
||||
}
|
||||
} else if rawprojs, ok := claims["projects"]; ok {
|
||||
if projSlice, ok := rawprojs.([]string); ok {
|
||||
projects = append(projects, projSlice...)
|
||||
}
|
||||
}
|
||||
|
||||
return projects
|
||||
}
|
||||
|
||||
// extractNameFromClaims extracts name from JWT claims
|
||||
// Handles both simple string and complex nested structure
|
||||
func extractNameFromClaims(claims jwt.MapClaims) string {
|
||||
// Try simple string first
|
||||
if name, ok := claims["name"].(string); ok {
|
||||
return name
|
||||
}
|
||||
|
||||
// Try nested structure: {name: {values: [...]}}
|
||||
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||
if vals, ok := wrap["values"].([]any); ok {
|
||||
if len(vals) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%v", vals[0])
|
||||
for i := 1; i < len(vals); i++ {
|
||||
name += fmt.Sprintf(" %v", vals[i])
|
||||
}
|
||||
return name
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// getUserFromJWT creates or retrieves a user based on JWT claims
|
||||
// If validateUser is true, the user must exist in the database
|
||||
// Otherwise, a new user object is created from claims
|
||||
// authSource should be a schema.AuthSource constant (like schema.AuthViaToken)
|
||||
func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.AuthType, authSource schema.AuthSource) (*schema.User, error) {
|
||||
sub := extractStringFromClaims(claims, "sub")
|
||||
if sub == "" {
|
||||
return nil, errors.New("missing 'sub' claim in JWT")
|
||||
}
|
||||
|
||||
if validateUser {
|
||||
// Validate user against database
|
||||
ur := repository.GetUserRepository()
|
||||
user, err := ur.GetUser(sub)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
cclog.Errorf("Error while loading user '%v': %v", sub, err)
|
||||
return nil, fmt.Errorf("database error: %w", err)
|
||||
}
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil || err == sql.ErrNoRows {
|
||||
cclog.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
|
||||
// Return database user (with database roles)
|
||||
return user, nil
|
||||
}
|
||||
|
||||
// Create user from JWT claims
|
||||
name := extractNameFromClaims(claims)
|
||||
roles := extractRolesFromClaims(claims, true) // Validate roles
|
||||
projects := extractProjectsFromClaims(claims)
|
||||
|
||||
return &schema.User{
|
||||
Username: sub,
|
||||
Name: name,
|
||||
Roles: roles,
|
||||
Projects: projects,
|
||||
AuthType: authType,
|
||||
AuthSource: authSource,
|
||||
}, nil
|
||||
}
|
||||
281
internal/auth/jwtHelpers_test.go
Normal file
281
internal/auth/jwtHelpers_test.go
Normal file
@@ -0,0 +1,281 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
// TestExtractStringFromClaims tests extracting string values from JWT claims
|
||||
func TestExtractStringFromClaims(t *testing.T) {
|
||||
claims := jwt.MapClaims{
|
||||
"sub": "testuser",
|
||||
"email": "test@example.com",
|
||||
"age": 25, // not a string
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
key string
|
||||
expected string
|
||||
}{
|
||||
{"Existing string", "sub", "testuser"},
|
||||
{"Another string", "email", "test@example.com"},
|
||||
{"Non-existent key", "missing", ""},
|
||||
{"Non-string value", "age", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractStringFromClaims(claims, tt.key)
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractRolesFromClaims tests role extraction and validation
|
||||
func TestExtractRolesFromClaims(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
claims jwt.MapClaims
|
||||
validateRoles bool
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
name: "Valid roles without validation",
|
||||
claims: jwt.MapClaims{
|
||||
"roles": []any{"admin", "user", "invalid_role"},
|
||||
},
|
||||
validateRoles: false,
|
||||
expected: []string{"admin", "user", "invalid_role"},
|
||||
},
|
||||
{
|
||||
name: "Valid roles with validation",
|
||||
claims: jwt.MapClaims{
|
||||
"roles": []any{"admin", "user", "api"},
|
||||
},
|
||||
validateRoles: true,
|
||||
expected: []string{"admin", "user", "api"},
|
||||
},
|
||||
{
|
||||
name: "Invalid roles with validation",
|
||||
claims: jwt.MapClaims{
|
||||
"roles": []any{"invalid_role", "fake_role"},
|
||||
},
|
||||
validateRoles: true,
|
||||
expected: []string{}, // Should filter out invalid roles
|
||||
},
|
||||
{
|
||||
name: "No roles claim",
|
||||
claims: jwt.MapClaims{},
|
||||
validateRoles: false,
|
||||
expected: []string{},
|
||||
},
|
||||
{
|
||||
name: "Non-array roles",
|
||||
claims: jwt.MapClaims{
|
||||
"roles": "admin",
|
||||
},
|
||||
validateRoles: false,
|
||||
expected: []string{},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
|
||||
|
||||
if len(result) != len(tt.expected) {
|
||||
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
|
||||
return
|
||||
}
|
||||
|
||||
for i, role := range result {
|
||||
if i >= len(tt.expected) || role != tt.expected[i] {
|
||||
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractProjectsFromClaims tests project extraction from claims
|
||||
func TestExtractProjectsFromClaims(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
claims jwt.MapClaims
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
name: "Projects as array of interfaces",
|
||||
claims: jwt.MapClaims{
|
||||
"projects": []any{"project1", "project2", "project3"},
|
||||
},
|
||||
expected: []string{"project1", "project2", "project3"},
|
||||
},
|
||||
{
|
||||
name: "Projects as string array",
|
||||
claims: jwt.MapClaims{
|
||||
"projects": []string{"projectA", "projectB"},
|
||||
},
|
||||
expected: []string{"projectA", "projectB"},
|
||||
},
|
||||
{
|
||||
name: "No projects claim",
|
||||
claims: jwt.MapClaims{},
|
||||
expected: []string{},
|
||||
},
|
||||
{
|
||||
name: "Mixed types in projects array",
|
||||
claims: jwt.MapClaims{
|
||||
"projects": []any{"project1", 123, "project2"},
|
||||
},
|
||||
expected: []string{"project1", "project2"}, // Should skip non-strings
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractProjectsFromClaims(tt.claims)
|
||||
|
||||
if len(result) != len(tt.expected) {
|
||||
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
|
||||
return
|
||||
}
|
||||
|
||||
for i, project := range result {
|
||||
if i >= len(tt.expected) || project != tt.expected[i] {
|
||||
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractNameFromClaims tests name extraction from various formats
|
||||
func TestExtractNameFromClaims(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
claims jwt.MapClaims
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "Simple string name",
|
||||
claims: jwt.MapClaims{
|
||||
"name": "John Doe",
|
||||
},
|
||||
expected: "John Doe",
|
||||
},
|
||||
{
|
||||
name: "Nested name structure",
|
||||
claims: jwt.MapClaims{
|
||||
"name": map[string]any{
|
||||
"values": []any{"John", "Doe"},
|
||||
},
|
||||
},
|
||||
expected: "John Doe",
|
||||
},
|
||||
{
|
||||
name: "Nested name with single value",
|
||||
claims: jwt.MapClaims{
|
||||
"name": map[string]any{
|
||||
"values": []any{"Alice"},
|
||||
},
|
||||
},
|
||||
expected: "Alice",
|
||||
},
|
||||
{
|
||||
name: "No name claim",
|
||||
claims: jwt.MapClaims{},
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Empty nested values",
|
||||
claims: jwt.MapClaims{
|
||||
"name": map[string]any{
|
||||
"values": []any{},
|
||||
},
|
||||
},
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Nested with non-string values",
|
||||
claims: jwt.MapClaims{
|
||||
"name": map[string]any{
|
||||
"values": []any{123, "Smith"},
|
||||
},
|
||||
},
|
||||
expected: "123 Smith", // Should convert to string
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractNameFromClaims(tt.claims)
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected '%s', got '%s'", tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetUserFromJWT_NoValidation tests getUserFromJWT without database validation
|
||||
func TestGetUserFromJWT_NoValidation(t *testing.T) {
|
||||
claims := jwt.MapClaims{
|
||||
"sub": "testuser",
|
||||
"name": "Test User",
|
||||
"roles": []any{"user", "admin"},
|
||||
"projects": []any{"project1", "project2"},
|
||||
}
|
||||
|
||||
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if user.Username != "testuser" {
|
||||
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
|
||||
}
|
||||
|
||||
if user.Name != "Test User" {
|
||||
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
|
||||
}
|
||||
|
||||
if len(user.Roles) != 2 {
|
||||
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
|
||||
}
|
||||
|
||||
if len(user.Projects) != 2 {
|
||||
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
|
||||
}
|
||||
|
||||
if user.AuthType != schema.AuthToken {
|
||||
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetUserFromJWT_MissingSub tests error when sub claim is missing
|
||||
func TestGetUserFromJWT_MissingSub(t *testing.T) {
|
||||
claims := jwt.MapClaims{
|
||||
"name": "Test User",
|
||||
}
|
||||
|
||||
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||
|
||||
if err == nil {
|
||||
t.Error("Expected error for missing sub claim")
|
||||
}
|
||||
|
||||
if err.Error() != "missing 'sub' claim in JWT" {
|
||||
t.Errorf("Expected specific error message, got: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -13,10 +13,8 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -30,13 +28,13 @@ func (ja *JWTSessionAuthenticator) Init() error {
|
||||
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode cross login JWT HS512 key")
|
||||
cclog.Warn("Could not decode cross login JWT HS512 key")
|
||||
return err
|
||||
}
|
||||
ja.loginTokenKey = bytes
|
||||
}
|
||||
|
||||
log.Info("JWT Session authenticator successfully registered")
|
||||
cclog.Info("JWT Session authenticator successfully registered")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -60,87 +58,33 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
rawtoken = r.URL.Query().Get("login-token")
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
|
||||
return ja.loginTokenKey, nil
|
||||
}
|
||||
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing jwt token")
|
||||
cclog.Warn("Error while parsing jwt token")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
sub, _ := claims["sub"].(string)
|
||||
|
||||
var roles []string
|
||||
projects := make([]string, 0)
|
||||
|
||||
if config.Keys.JwtConfig.ValidateUser {
|
||||
var err error
|
||||
user, err = repository.GetUserRepository().GetUser(sub)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", sub)
|
||||
}
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
} else {
|
||||
var name string
|
||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
||||
if len(vals) != 0 {
|
||||
name = fmt.Sprintf("%v", vals[0])
|
||||
|
||||
for i := 1; i < len(vals); i++ {
|
||||
name += fmt.Sprintf(" %v", vals[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
if schema.IsValidRole(r) {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if rawprojs, ok := claims["projects"].([]interface{}); ok {
|
||||
for _, pp := range rawprojs {
|
||||
if p, ok := pp.(string); ok {
|
||||
projects = append(projects, p)
|
||||
}
|
||||
}
|
||||
} else if rawprojs, ok := claims["projects"]; ok {
|
||||
projects = append(projects, rawprojs.([]string)...)
|
||||
}
|
||||
|
||||
user = &schema.User{
|
||||
Username: sub,
|
||||
Name: name,
|
||||
Roles: roles,
|
||||
Projects: projects,
|
||||
AuthType: schema.AuthSession,
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
}
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Sync or update user if configured
|
||||
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
|
||||
return user, nil
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -10,15 +11,27 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/go-ldap/ldap/v3"
|
||||
)
|
||||
|
||||
type LdapConfig struct {
|
||||
URL string `json:"url"`
|
||||
UserBase string `json:"user_base"`
|
||||
SearchDN string `json:"search_dn"`
|
||||
UserBind string `json:"user_bind"`
|
||||
UserFilter string `json:"user_filter"`
|
||||
UserAttr string `json:"username_attr"`
|
||||
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
|
||||
SyncDelOldUsers bool `json:"sync_del_old_users"`
|
||||
|
||||
// Should an non-existent user be added to the DB if user exists in ldap directory
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
}
|
||||
|
||||
type LdapAuthenticator struct {
|
||||
syncPassword string
|
||||
UserAttr string
|
||||
@@ -29,40 +42,11 @@ var _ Authenticator = (*LdapAuthenticator)(nil)
|
||||
func (la *LdapAuthenticator) Init() error {
|
||||
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
|
||||
if la.syncPassword == "" {
|
||||
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
||||
cclog.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
||||
}
|
||||
|
||||
lc := config.Keys.LdapConfig
|
||||
|
||||
if lc.SyncInterval != "" {
|
||||
interval, err := time.ParseDuration(lc.SyncInterval)
|
||||
if err != nil {
|
||||
log.Warnf("Could not parse duration for sync interval: %v",
|
||||
lc.SyncInterval)
|
||||
return err
|
||||
}
|
||||
|
||||
if interval == 0 {
|
||||
log.Info("Sync interval is zero")
|
||||
return nil
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
for t := range ticker.C {
|
||||
log.Printf("sync started at %s", t.Format(time.RFC3339))
|
||||
if err := la.Sync(); err != nil {
|
||||
log.Errorf("sync failed: %s", err.Error())
|
||||
}
|
||||
log.Print("sync done")
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
log.Info("LDAP configuration key sync_interval invalid")
|
||||
}
|
||||
|
||||
if lc.UserAttr != "" {
|
||||
la.UserAttr = lc.UserAttr
|
||||
if Keys.LdapConfig.UserAttr != "" {
|
||||
la.UserAttr = Keys.LdapConfig.UserAttr
|
||||
} else {
|
||||
la.UserAttr = "gecos"
|
||||
}
|
||||
@@ -76,7 +60,7 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
lc := config.Keys.LdapConfig
|
||||
lc := Keys.LdapConfig
|
||||
|
||||
if user != nil {
|
||||
if user.AuthSource == schema.AuthViaLDAP {
|
||||
@@ -86,7 +70,8 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
if lc.SyncUserOnLogin {
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
log.Error("LDAP connection error")
|
||||
cclog.Error("LDAP connection error")
|
||||
return nil, false
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
@@ -99,12 +84,12 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
|
||||
sr, err := l.Search(searchRequest)
|
||||
if err != nil {
|
||||
log.Warn(err)
|
||||
cclog.Warn(err)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
if len(sr.Entries) != 1 {
|
||||
log.Warn("LDAP: User does not exist or too many entries returned")
|
||||
cclog.Warn("LDAP: User does not exist or too many entries returned")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
@@ -124,7 +109,7 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
}
|
||||
|
||||
if err := repository.GetUserRepository().AddUser(user); err != nil {
|
||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
@@ -142,14 +127,14 @@ func (la *LdapAuthenticator) Login(
|
||||
) (*schema.User, error) {
|
||||
l, err := la.getLdapConnection(false)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting ldap connection")
|
||||
cclog.Warn("Error while getting ldap connection")
|
||||
return nil, err
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
userDn := strings.Replace(config.Keys.LdapConfig.UserBind, "{username}", user.Username, -1)
|
||||
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
|
||||
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
|
||||
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||
user.Username, err)
|
||||
return nil, fmt.Errorf("Authentication failed")
|
||||
}
|
||||
@@ -158,11 +143,11 @@ func (la *LdapAuthenticator) Login(
|
||||
}
|
||||
|
||||
func (la *LdapAuthenticator) Sync() error {
|
||||
const IN_DB int = 1
|
||||
const IN_LDAP int = 2
|
||||
const IN_BOTH int = 3
|
||||
const InDB int = 1
|
||||
const InLdap int = 2
|
||||
const InBoth int = 3
|
||||
ur := repository.GetUserRepository()
|
||||
lc := config.Keys.LdapConfig
|
||||
lc := Keys.LdapConfig
|
||||
|
||||
users := map[string]int{}
|
||||
usernames, err := ur.GetLdapUsernames()
|
||||
@@ -171,12 +156,12 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
}
|
||||
|
||||
for _, username := range usernames {
|
||||
users[username] = IN_DB
|
||||
users[username] = InDB
|
||||
}
|
||||
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
log.Error("LDAP connection error")
|
||||
cclog.Error("LDAP connection error")
|
||||
return err
|
||||
}
|
||||
defer l.Close()
|
||||
@@ -187,7 +172,7 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
lc.UserFilter,
|
||||
[]string{"dn", "uid", la.UserAttr}, nil))
|
||||
if err != nil {
|
||||
log.Warn("LDAP search error")
|
||||
cclog.Warn("LDAP search error")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -200,18 +185,18 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
|
||||
_, ok := users[username]
|
||||
if !ok {
|
||||
users[username] = IN_LDAP
|
||||
users[username] = InLdap
|
||||
newnames[username] = entry.GetAttributeValue(la.UserAttr)
|
||||
} else {
|
||||
users[username] = IN_BOTH
|
||||
users[username] = InBoth
|
||||
}
|
||||
}
|
||||
|
||||
for username, where := range users {
|
||||
if where == IN_DB && lc.SyncDelOldUsers {
|
||||
if where == InDB && lc.SyncDelOldUsers {
|
||||
ur.DelUser(username)
|
||||
log.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||
} else if where == IN_LDAP {
|
||||
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||
} else if where == InLdap {
|
||||
name := newnames[username]
|
||||
|
||||
var roles []string
|
||||
@@ -226,9 +211,9 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
AuthSource: schema.AuthViaLDAP,
|
||||
}
|
||||
|
||||
log.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
||||
cclog.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
||||
if err := ur.AddUser(user); err != nil {
|
||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
return err
|
||||
}
|
||||
}
|
||||
@@ -238,17 +223,17 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
}
|
||||
|
||||
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
||||
lc := config.Keys.LdapConfig
|
||||
conn, err := ldap.DialURL(lc.Url)
|
||||
lc := Keys.LdapConfig
|
||||
conn, err := ldap.DialURL(lc.URL)
|
||||
if err != nil {
|
||||
log.Warn("LDAP URL dial failed")
|
||||
cclog.Warn("LDAP URL dial failed")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if admin {
|
||||
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
||||
conn.Close()
|
||||
log.Warn("LDAP connection bind failed")
|
||||
cclog.Warn("LDAP connection bind failed")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
)
|
||||
|
||||
@@ -27,19 +28,19 @@ func (la *LocalAuthenticator) CanLogin(
|
||||
user *schema.User,
|
||||
username string,
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request) (*schema.User, bool) {
|
||||
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
|
||||
}
|
||||
|
||||
func (la *LocalAuthenticator) Login(
|
||||
user *schema.User,
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request) (*schema.User, error) {
|
||||
|
||||
r *http.Request,
|
||||
) (*schema.User, error) {
|
||||
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
|
||||
[]byte(r.FormValue("password"))); e != nil {
|
||||
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
||||
cclog.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
||||
return nil, fmt.Errorf("Authentication failed")
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,15 +14,20 @@ import (
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/coreos/go-oidc/v3/oidc"
|
||||
"github.com/gorilla/mux"
|
||||
"golang.org/x/oauth2"
|
||||
)
|
||||
|
||||
type OpenIDConfig struct {
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type OIDC struct {
|
||||
client *oauth2.Config
|
||||
provider *oidc.Provider
|
||||
@@ -48,18 +54,23 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
|
||||
http.SetCookie(w, c)
|
||||
}
|
||||
|
||||
// NewOIDC creates a new OIDC authenticator with the configured provider
|
||||
func NewOIDC(a *Authentication) *OIDC {
|
||||
provider, err := oidc.NewProvider(context.Background(), config.Keys.OpenIDConfig.Provider)
|
||||
// Use context with timeout for provider initialization
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
clientID := os.Getenv("OID_CLIENT_ID")
|
||||
if clientID == "" {
|
||||
log.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
||||
cclog.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
||||
}
|
||||
clientSecret := os.Getenv("OID_CLIENT_SECRET")
|
||||
if clientSecret == "" {
|
||||
log.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
||||
cclog.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
||||
}
|
||||
|
||||
client := &oauth2.Config{
|
||||
@@ -105,13 +116,18 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
http.Error(rw, "Code not found", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
token, err := oa.client.Exchange(context.Background(), code, oauth2.VerifierOption(codeVerifier))
|
||||
// Exchange authorization code for token with timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
userInfo, err := oa.provider.UserInfo(context.Background(), oauth2.StaticTokenSource(token))
|
||||
// Get user info from OIDC provider with same timeout
|
||||
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@@ -168,14 +184,14 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
AuthSource: schema.AuthViaOIDC,
|
||||
}
|
||||
|
||||
if config.Keys.OpenIDConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if Keys.OpenIDConfig.SyncUserOnLogin || Keys.OpenIDConfig.UpdateUserOnLogin {
|
||||
handleOIDCUser(user)
|
||||
}
|
||||
|
||||
oa.authentication.SaveSession(rw, r, user)
|
||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(ctx))
|
||||
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
|
||||
}
|
||||
|
||||
func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
96
internal/auth/schema.go
Normal file
96
internal/auth/schema.go
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
var configSchema = `
|
||||
{
|
||||
"jwts": {
|
||||
"description": "For JWT token authentication.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"max-age": {
|
||||
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||
"type": "string"
|
||||
},
|
||||
"cookieName": {
|
||||
"description": "Cookie that should be checked for a JWT token.",
|
||||
"type": "string"
|
||||
},
|
||||
"validateUser": {
|
||||
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"trustedIssuer": {
|
||||
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["max-age"]
|
||||
},
|
||||
"oidc": {
|
||||
"provider": {
|
||||
"description": "",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"updateUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"required": ["provider"]
|
||||
},
|
||||
"ldap": {
|
||||
"description": "For LDAP Authentication and user synchronisation.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"description": "URL of LDAP directory server.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_base": {
|
||||
"description": "Base DN of user tree root.",
|
||||
"type": "string"
|
||||
},
|
||||
"search_dn": {
|
||||
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_bind": {
|
||||
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_filter": {
|
||||
"description": "Filter to extract users for syncing.",
|
||||
"type": "string"
|
||||
},
|
||||
"username_attr": {
|
||||
"description": "Attribute with full username. Default: gecos",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_interval": {
|
||||
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_del_old_users": {
|
||||
"description": "Delete obsolete users in database.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
|
||||
},
|
||||
"required": ["jwts"]
|
||||
}`
|
||||
@@ -1,73 +1,160 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package config implements the program configuration data structures, validation and parsing
|
||||
package config
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
)
|
||||
|
||||
var Keys schema.ProgramConfig = schema.ProgramConfig{
|
||||
type ProgramConfig struct {
|
||||
// Address where the http (or https) server will listen on (for example: 'localhost:80').
|
||||
Addr string `json:"addr"`
|
||||
|
||||
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
|
||||
APIAllowedIPs []string `json:"apiAllowedIPs"`
|
||||
|
||||
APISubjects *NATSConfig `json:"apiSubjects"`
|
||||
|
||||
// Drop root permissions once .env was read and the port was taken.
|
||||
User string `json:"user"`
|
||||
Group string `json:"group"`
|
||||
|
||||
// Disable authentication (for everything: API, Web-UI, ...)
|
||||
DisableAuthentication bool `json:"disable-authentication"`
|
||||
|
||||
// If `embed-static-files` is true (default), the frontend files are directly
|
||||
// embeded into the go binary and expected to be in web/frontend. Only if
|
||||
// it is false the files in `static-files` are served instead.
|
||||
EmbedStaticFiles bool `json:"embed-static-files"`
|
||||
StaticFiles string `json:"static-files"`
|
||||
|
||||
// Database driver - only 'sqlite3' is supported
|
||||
DBDriver string `json:"db-driver"`
|
||||
|
||||
// Path to SQLite database file
|
||||
DB string `json:"db"`
|
||||
|
||||
// Keep all metric data in the metric data repositories,
|
||||
// do not write to the job-archive.
|
||||
DisableArchive bool `json:"disable-archive"`
|
||||
|
||||
EnableJobTaggers bool `json:"enable-job-taggers"`
|
||||
|
||||
// Validate json input against schema
|
||||
Validate bool `json:"validate"`
|
||||
|
||||
// If 0 or empty, the session does not expire!
|
||||
SessionMaxAge string `json:"session-max-age"`
|
||||
|
||||
// If both those options are not empty, use HTTPS using those certificates.
|
||||
HTTPSCertFile string `json:"https-cert-file"`
|
||||
HTTPSKeyFile string `json:"https-key-file"`
|
||||
|
||||
// If not the empty string and `addr` does not end in ":80",
|
||||
// redirect every request incoming at port 80 to that url.
|
||||
RedirectHTTPTo string `json:"redirect-http-to"`
|
||||
|
||||
// Where to store MachineState files
|
||||
MachineStateDir string `json:"machine-state-dir"`
|
||||
|
||||
// If not zero, automatically mark jobs as stopped running X seconds longer than their walltime.
|
||||
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
|
||||
|
||||
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
|
||||
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
|
||||
|
||||
// Energy Mix CO2 Emission Constant [g/kWh]
|
||||
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
|
||||
EmissionConstant int `json:"emission-constant"`
|
||||
|
||||
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
|
||||
EnableResampling *ResampleConfig `json:"resampling"`
|
||||
|
||||
// Global upstream metric repository configuration for metric pull workers
|
||||
UpstreamMetricRepository *json.RawMessage `json:"upstreamMetricRepository,omitempty"`
|
||||
}
|
||||
|
||||
type ResampleConfig struct {
|
||||
// Minimum number of points to trigger resampling of data
|
||||
MinimumPoints int `json:"minimumPoints"`
|
||||
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
|
||||
Resolutions []int `json:"resolutions"`
|
||||
// Trigger next zoom level at less than this many visible datapoints
|
||||
Trigger int `json:"trigger"`
|
||||
}
|
||||
|
||||
type NATSConfig struct {
|
||||
SubjectJobStart string `json:"subjectJobStart"`
|
||||
SubjectJobStop string `json:"subjectJobStop"`
|
||||
SubjectNodeState string `json:"subjectNodeState"`
|
||||
}
|
||||
|
||||
type IntRange struct {
|
||||
From int `json:"from"`
|
||||
To int `json:"to"`
|
||||
}
|
||||
|
||||
type TimeRange struct {
|
||||
From *time.Time `json:"from"`
|
||||
To *time.Time `json:"to"`
|
||||
Range string `json:"range,omitempty"`
|
||||
}
|
||||
|
||||
type FilterRanges struct {
|
||||
Duration *IntRange `json:"duration"`
|
||||
NumNodes *IntRange `json:"numNodes"`
|
||||
StartTime *TimeRange `json:"startTime"`
|
||||
}
|
||||
|
||||
type ClusterConfig struct {
|
||||
Name string `json:"name"`
|
||||
FilterRanges *FilterRanges `json:"filterRanges"`
|
||||
}
|
||||
|
||||
var Clusters []*ClusterConfig
|
||||
|
||||
var Keys ProgramConfig = ProgramConfig{
|
||||
Addr: "localhost:8080",
|
||||
DisableAuthentication: false,
|
||||
EmbedStaticFiles: true,
|
||||
DBDriver: "sqlite3",
|
||||
DB: "./var/job.db",
|
||||
Archive: json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`),
|
||||
DisableArchive: false,
|
||||
Validate: false,
|
||||
SessionMaxAge: "168h",
|
||||
StopJobsExceedingWalltime: 0,
|
||||
ShortRunningJobsDuration: 5 * 60,
|
||||
UiDefaults: map[string]interface{}{
|
||||
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
|
||||
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_showFootprint": true,
|
||||
"job_list_usePaging": true,
|
||||
"plot_general_colorBackground": true,
|
||||
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
|
||||
"plot_general_lineWidth": 3,
|
||||
"plot_list_jobsPerPage": 50,
|
||||
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw"},
|
||||
"plot_view_plotsPerRow": 3,
|
||||
"plot_view_showPolarplot": true,
|
||||
"plot_view_showRoofline": true,
|
||||
"plot_view_showStatTable": true,
|
||||
"system_view_selectedMetric": "cpu_load",
|
||||
"analysis_view_selectedTopEntity": "user",
|
||||
"analysis_view_selectedTopCategory": "totalWalltime",
|
||||
"status_view_selectedTopUserCategory": "totalJobs",
|
||||
"status_view_selectedTopProjectCategory": "totalJobs",
|
||||
},
|
||||
}
|
||||
|
||||
func Init(flagConfigFile string) {
|
||||
raw, err := os.ReadFile(flagConfigFile)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Fatalf("CONFIG ERROR: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
|
||||
log.Fatalf("Validate config: %v\n", err)
|
||||
}
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
log.Fatalf("could not decode: %v", err)
|
||||
}
|
||||
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
|
||||
Validate(configSchema, mainConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(mainConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
|
||||
log.Fatal("At least one cluster required in config!")
|
||||
}
|
||||
Validate(clustersSchema, clusterConfig)
|
||||
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Clusters); err != nil {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
if len(Clusters) < 1 {
|
||||
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
|
||||
}
|
||||
|
||||
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
|
||||
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,30 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func TestInit(t *testing.T) {
|
||||
fp := "../../configs/config.json"
|
||||
Init(fp)
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
if Keys.Addr != "0.0.0.0:443" {
|
||||
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
|
||||
}
|
||||
@@ -18,7 +32,17 @@ func TestInit(t *testing.T) {
|
||||
|
||||
func TestInitMinimal(t *testing.T) {
|
||||
fp := "../../configs/config-demo.json"
|
||||
Init(fp)
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
if Keys.Addr != "127.0.0.1:8080" {
|
||||
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
|
||||
}
|
||||
|
||||
51
internal/config/default_metrics.go
Normal file
51
internal/config/default_metrics.go
Normal file
@@ -0,0 +1,51 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DEPRECATED: SUPERSEDED BY NEW USER CONFIG - userConfig.go / web.go
|
||||
|
||||
type DefaultMetricsCluster struct {
|
||||
Name string `json:"name"`
|
||||
DefaultMetrics string `json:"default_metrics"`
|
||||
}
|
||||
|
||||
type DefaultMetricsConfig struct {
|
||||
Clusters []DefaultMetricsCluster `json:"clusters"`
|
||||
}
|
||||
|
||||
func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) {
|
||||
filePath := "default_metrics.json"
|
||||
if _, err := os.Stat(filePath); os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg DefaultMetricsConfig
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func ParseMetricsString(s string) []string {
|
||||
parts := strings.Split(s, ",")
|
||||
var metrics []string
|
||||
for _, p := range parts {
|
||||
trimmed := strings.TrimSpace(p)
|
||||
if trimmed != "" {
|
||||
metrics = append(metrics, trimmed)
|
||||
}
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
222
internal/config/schema.go
Normal file
222
internal/config/schema.go
Normal file
@@ -0,0 +1,222 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
var configSchema = `
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"addr": {
|
||||
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||
"type": "string"
|
||||
},
|
||||
"apiAllowedIPs": {
|
||||
"description": "Addresses from which secured API endpoints can be reached",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"group": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-authentication": {
|
||||
"description": "Disable authentication (for everything: API, Web-UI, ...).",
|
||||
"type": "boolean"
|
||||
},
|
||||
"embed-static-files": {
|
||||
"description": "If all files in web/frontend/public should be served from within the binary itself (they are embedded) or not.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"static-files": {
|
||||
"description": "Folder where static assets can be found, if embed-static-files is false.",
|
||||
"type": "string"
|
||||
},
|
||||
"db": {
|
||||
"description": "Path to SQLite database file (e.g., './var/job.db')",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-archive": {
|
||||
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"enable-job-taggers": {
|
||||
"description": "Turn on automatic application and jobclass taggers",
|
||||
"type": "boolean"
|
||||
},
|
||||
"validate": {
|
||||
"description": "Validate all input json documents against json schema.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"session-max-age": {
|
||||
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
|
||||
"type": "string"
|
||||
},
|
||||
"https-cert-file": {
|
||||
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"https-key-file": {
|
||||
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"redirect-http-to": {
|
||||
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
|
||||
"type": "string"
|
||||
},
|
||||
"stop-jobs-exceeding-walltime": {
|
||||
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
|
||||
"type": "integer"
|
||||
},
|
||||
"short-running-jobs-duration": {
|
||||
"description": "Do not show running jobs shorter than X seconds.",
|
||||
"type": "integer"
|
||||
},
|
||||
"emission-constant": {
|
||||
"description": ".",
|
||||
"type": "integer"
|
||||
},
|
||||
"cron-frequency": {
|
||||
"description": "Frequency of cron job workers.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"duration-worker": {
|
||||
"description": "Duration Update Worker [Defaults to '5m']",
|
||||
"type": "string"
|
||||
},
|
||||
"footprint-worker": {
|
||||
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"enable-resampling": {
|
||||
"description": "Enable dynamic zoom in frontend metric plots.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"minimumPoints": {
|
||||
"description": "Minimum points to trigger resampling of time-series data.",
|
||||
"type": "integer"
|
||||
},
|
||||
"trigger": {
|
||||
"description": "Trigger next zoom level at less than this many visible datapoints.",
|
||||
"type": "integer"
|
||||
},
|
||||
"resolutions": {
|
||||
"description": "Array of resampling target resolutions, in seconds.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["trigger", "resolutions"]
|
||||
},
|
||||
"upstreamMetricRepository": {
|
||||
"description": "Global upstream metric repository configuration for metric pull workers",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["kind"]
|
||||
}
|
||||
},
|
||||
"required": ["apiAllowedIPs"]
|
||||
}`
|
||||
|
||||
var clustersSchema = `
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The name of the cluster.",
|
||||
"type": "string"
|
||||
},
|
||||
"metricDataRepository": {
|
||||
"description": "Type of the metric data repository for this cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["kind"]
|
||||
},
|
||||
"filterRanges": {
|
||||
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"numNodes": {
|
||||
"description": "UI slider range for number of nodes",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"duration": {
|
||||
"description": "UI slider range for duration",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "UI slider range for start time",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"to": {
|
||||
"type": "null"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
}
|
||||
},
|
||||
"required": ["numNodes", "duration", "startTime"]
|
||||
}
|
||||
},
|
||||
"required": ["name", "filterRanges"],
|
||||
"minItems": 1
|
||||
}
|
||||
}`
|
||||
29
internal/config/validate.go
Normal file
29
internal/config/validate.go
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/santhosh-tekuri/jsonschema/v5"
|
||||
)
|
||||
|
||||
func Validate(schema string, instance json.RawMessage) {
|
||||
sch, err := jsonschema.CompileString("schema.json", schema)
|
||||
if err != nil {
|
||||
cclog.Fatalf("%#v", err)
|
||||
}
|
||||
|
||||
var v any
|
||||
if err := json.Unmarshal([]byte(instance), &v); err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
if err = sch.Validate(v); err != nil {
|
||||
cclog.Fatalf("%#v", err)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package model
|
||||
|
||||
@@ -3,24 +3,50 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type ClusterMetricWithName struct {
|
||||
Name string `json:"name"`
|
||||
Unit *schema.Unit `json:"unit,omitempty"`
|
||||
Timestep int `json:"timestep"`
|
||||
Data []schema.Float `json:"data"`
|
||||
}
|
||||
|
||||
type ClusterMetrics struct {
|
||||
NodeCount int `json:"nodeCount"`
|
||||
Metrics []*ClusterMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type Count struct {
|
||||
Name string `json:"name"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
type EnergyFootprintValue struct {
|
||||
Hardware string `json:"hardware"`
|
||||
Metric string `json:"metric"`
|
||||
Value float64 `json:"value"`
|
||||
}
|
||||
|
||||
type FloatRange struct {
|
||||
From float64 `json:"from"`
|
||||
To float64 `json:"to"`
|
||||
}
|
||||
|
||||
type FootprintValue struct {
|
||||
Name string `json:"name"`
|
||||
Stat string `json:"stat"`
|
||||
Value float64 `json:"value"`
|
||||
}
|
||||
|
||||
type Footprints struct {
|
||||
TimeWeights *TimeWeights `json:"timeWeights"`
|
||||
Metrics []*MetricFootprints `json:"metrics"`
|
||||
@@ -38,6 +64,7 @@ type IntRangeOutput struct {
|
||||
|
||||
type JobFilter struct {
|
||||
Tags []string `json:"tags,omitempty"`
|
||||
DbID []string `json:"dbId,omitempty"`
|
||||
JobID *StringInput `json:"jobId,omitempty"`
|
||||
ArrayJobID *int `json:"arrayJobId,omitempty"`
|
||||
User *StringInput `json:"user,omitempty"`
|
||||
@@ -45,18 +72,16 @@ type JobFilter struct {
|
||||
JobName *StringInput `json:"jobName,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Partition *StringInput `json:"partition,omitempty"`
|
||||
Duration *schema.IntRange `json:"duration,omitempty"`
|
||||
Duration *config.IntRange `json:"duration,omitempty"`
|
||||
Energy *FloatRange `json:"energy,omitempty"`
|
||||
MinRunningFor *int `json:"minRunningFor,omitempty"`
|
||||
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
|
||||
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
|
||||
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
|
||||
StartTime *schema.TimeRange `json:"startTime,omitempty"`
|
||||
NumNodes *config.IntRange `json:"numNodes,omitempty"`
|
||||
NumAccelerators *config.IntRange `json:"numAccelerators,omitempty"`
|
||||
NumHWThreads *config.IntRange `json:"numHWThreads,omitempty"`
|
||||
StartTime *config.TimeRange `json:"startTime,omitempty"`
|
||||
State []schema.JobState `json:"state,omitempty"`
|
||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
|
||||
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
|
||||
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
|
||||
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
|
||||
Exclusive *int `json:"exclusive,omitempty"`
|
||||
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
|
||||
Shared *string `json:"shared,omitempty"`
|
||||
Node *StringInput `json:"node,omitempty"`
|
||||
}
|
||||
|
||||
@@ -85,9 +110,23 @@ type JobResultList struct {
|
||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||
}
|
||||
|
||||
type JobStats struct {
|
||||
ID int `json:"id"`
|
||||
JobID string `json:"jobId"`
|
||||
StartTime int `json:"startTime"`
|
||||
Duration int `json:"duration"`
|
||||
Cluster string `json:"cluster"`
|
||||
SubCluster string `json:"subCluster"`
|
||||
NumNodes int `json:"numNodes"`
|
||||
NumHWThreads *int `json:"numHWThreads,omitempty"`
|
||||
NumAccelerators *int `json:"numAccelerators,omitempty"`
|
||||
Stats []*NamedStats `json:"stats"`
|
||||
}
|
||||
|
||||
type JobsStatistics struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
TotalUsers int `json:"totalUsers"`
|
||||
TotalJobs int `json:"totalJobs"`
|
||||
RunningJobs int `json:"runningJobs"`
|
||||
ShortJobs int `json:"shortJobs"`
|
||||
@@ -120,20 +159,73 @@ type MetricHistoPoint struct {
|
||||
type MetricHistoPoints struct {
|
||||
Metric string `json:"metric"`
|
||||
Unit string `json:"unit"`
|
||||
Stat *string `json:"stat,omitempty"`
|
||||
Data []*MetricHistoPoint `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
type MetricStatItem struct {
|
||||
MetricName string `json:"metricName"`
|
||||
Range *FloatRange `json:"range"`
|
||||
}
|
||||
|
||||
type Mutation struct {
|
||||
}
|
||||
|
||||
type NamedStats struct {
|
||||
Name string `json:"name"`
|
||||
Data *schema.MetricStatistics `json:"data"`
|
||||
}
|
||||
|
||||
type NamedStatsWithScope struct {
|
||||
Name string `json:"name"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
Stats []*ScopedStats `json:"stats"`
|
||||
}
|
||||
|
||||
type NodeFilter struct {
|
||||
Hostname *StringInput `json:"hostname,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Subcluster *StringInput `json:"subcluster,omitempty"`
|
||||
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
|
||||
HealthState *string `json:"healthState,omitempty"`
|
||||
TimeStart *int `json:"timeStart,omitempty"`
|
||||
}
|
||||
|
||||
type NodeMetrics struct {
|
||||
Host string `json:"host"`
|
||||
State string `json:"state"`
|
||||
SubCluster string `json:"subCluster"`
|
||||
Metrics []*JobMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type NodeStateResultList struct {
|
||||
Items []*schema.Node `json:"items"`
|
||||
Count *int `json:"count,omitempty"`
|
||||
}
|
||||
|
||||
type NodeStates struct {
|
||||
State string `json:"state"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
type NodeStatesTimed struct {
|
||||
State string `json:"state"`
|
||||
Counts []int `json:"counts"`
|
||||
Times []int `json:"times"`
|
||||
}
|
||||
|
||||
type NodesResultList struct {
|
||||
Items []*NodeMetrics `json:"items"`
|
||||
Offset *int `json:"offset,omitempty"`
|
||||
Limit *int `json:"limit,omitempty"`
|
||||
Count *int `json:"count,omitempty"`
|
||||
TotalNodes *int `json:"totalNodes,omitempty"`
|
||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||
}
|
||||
|
||||
type OrderByInput struct {
|
||||
Field string `json:"field"`
|
||||
Type string `json:"type"`
|
||||
Order SortDirectionEnum `json:"order"`
|
||||
}
|
||||
|
||||
@@ -142,7 +234,10 @@ type PageRequest struct {
|
||||
Page int `json:"page"`
|
||||
}
|
||||
|
||||
type Query struct {
|
||||
type ScopedStats struct {
|
||||
Hostname string `json:"hostname"`
|
||||
ID *string `json:"id,omitempty"`
|
||||
Data *schema.MetricStatistics `json:"data"`
|
||||
}
|
||||
|
||||
type StringInput struct {
|
||||
@@ -155,8 +250,9 @@ type StringInput struct {
|
||||
}
|
||||
|
||||
type TimeRangeOutput struct {
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
Range *string `json:"range,omitempty"`
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type TimeWeights struct {
|
||||
@@ -174,20 +270,22 @@ type User struct {
|
||||
type Aggregate string
|
||||
|
||||
const (
|
||||
AggregateUser Aggregate = "USER"
|
||||
AggregateProject Aggregate = "PROJECT"
|
||||
AggregateCluster Aggregate = "CLUSTER"
|
||||
AggregateUser Aggregate = "USER"
|
||||
AggregateProject Aggregate = "PROJECT"
|
||||
AggregateCluster Aggregate = "CLUSTER"
|
||||
AggregateSubcluster Aggregate = "SUBCLUSTER"
|
||||
)
|
||||
|
||||
var AllAggregate = []Aggregate{
|
||||
AggregateUser,
|
||||
AggregateProject,
|
||||
AggregateCluster,
|
||||
AggregateSubcluster,
|
||||
}
|
||||
|
||||
func (e Aggregate) IsValid() bool {
|
||||
switch e {
|
||||
case AggregateUser, AggregateProject, AggregateCluster:
|
||||
case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -197,7 +295,7 @@ func (e Aggregate) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
|
||||
func (e *Aggregate) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -214,11 +312,26 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *Aggregate) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e Aggregate) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
type SortByAggregate string
|
||||
|
||||
const (
|
||||
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
|
||||
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
|
||||
SortByAggregateTotalusers SortByAggregate = "TOTALUSERS"
|
||||
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
|
||||
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
|
||||
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
|
||||
@@ -230,6 +343,7 @@ const (
|
||||
var AllSortByAggregate = []SortByAggregate{
|
||||
SortByAggregateTotalwalltime,
|
||||
SortByAggregateTotaljobs,
|
||||
SortByAggregateTotalusers,
|
||||
SortByAggregateTotalnodes,
|
||||
SortByAggregateTotalnodehours,
|
||||
SortByAggregateTotalcores,
|
||||
@@ -240,7 +354,7 @@ var AllSortByAggregate = []SortByAggregate{
|
||||
|
||||
func (e SortByAggregate) IsValid() bool {
|
||||
switch e {
|
||||
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
||||
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -250,7 +364,7 @@ func (e SortByAggregate) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *SortByAggregate) UnmarshalGQL(v interface{}) error {
|
||||
func (e *SortByAggregate) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -267,6 +381,20 @@ func (e SortByAggregate) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *SortByAggregate) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e SortByAggregate) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
type SortDirectionEnum string
|
||||
|
||||
const (
|
||||
@@ -291,7 +419,7 @@ func (e SortDirectionEnum) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
||||
func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -307,3 +435,17 @@ func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
||||
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *SortDirectionEnum) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e SortDirectionEnum) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
@@ -1,15 +1,39 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// This file will not be regenerated automatically.
|
||||
//
|
||||
// It serves as dependency injection for your app, add any dependencies you require here.
|
||||
var (
|
||||
initOnce sync.Once
|
||||
resolverInstance *Resolver
|
||||
)
|
||||
|
||||
type Resolver struct {
|
||||
DB *sqlx.DB
|
||||
Repo *repository.JobRepository
|
||||
}
|
||||
|
||||
func Init() {
|
||||
initOnce.Do(func() {
|
||||
db := repository.GetConnection()
|
||||
resolverInstance = &Resolver{
|
||||
DB: db.DB, Repo: repository.GetJobRepository(),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetResolverInstance() *Resolver {
|
||||
if resolverInstance == nil {
|
||||
cclog.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return resolverInstance
|
||||
}
|
||||
|
||||
@@ -1,24 +1,29 @@
|
||||
package graph
|
||||
|
||||
// This file will be automatically regenerated based on the schema, any resolver implementations
|
||||
// This file will be automatically regenerated based on the schema, any resolver
|
||||
// implementations
|
||||
// will be copied through when generating and any unknown code will be moved to the end.
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.45
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.84
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Partitions is the resolver for the partitions field.
|
||||
@@ -26,26 +31,93 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
|
||||
return r.Repo.Partitions(obj.Name)
|
||||
}
|
||||
|
||||
// StartTime is the resolver for the startTime field.
|
||||
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
|
||||
timestamp := time.Unix(obj.StartTime, 0)
|
||||
return ×tamp, nil
|
||||
}
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(&obj.ID)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID)
|
||||
}
|
||||
|
||||
// ConcurrentJobs is the resolver for the concurrentJobs field.
|
||||
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
|
||||
if obj.State == schema.JobStateRunning {
|
||||
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
|
||||
}
|
||||
|
||||
if obj.Exclusive != 1 && obj.Duration > 600 {
|
||||
// FIXME: Make the hardcoded duration configurable
|
||||
if obj.Shared != "none" && obj.Duration > 600 {
|
||||
return r.Repo.FindConcurrentJobs(ctx, obj)
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Footprint is the resolver for the footprint field.
|
||||
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
|
||||
rawFootprint, err := r.Repo.FetchFootprint(obj)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while fetching job footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.FootprintValue{}
|
||||
for name, value := range rawFootprint {
|
||||
|
||||
parts := strings.Split(name, "_")
|
||||
statPart := parts[len(parts)-1]
|
||||
nameParts := parts[:len(parts)-1]
|
||||
|
||||
res = append(res, &model.FootprintValue{
|
||||
Name: strings.Join(nameParts, "_"),
|
||||
Stat: statPart,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
// EnergyFootprint is the resolver for the energyFootprint field.
|
||||
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
|
||||
rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while fetching job energy footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.EnergyFootprintValue{}
|
||||
for name, value := range rawEnergyFootprint {
|
||||
// Suboptimal: Nearly hardcoded metric name expectations
|
||||
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
|
||||
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
|
||||
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
|
||||
matchCore := regexp.MustCompile(`core|Core|CORE`)
|
||||
|
||||
hwType := ""
|
||||
switch test := name; { // NOtice ';' for var declaration
|
||||
case matchCPU.MatchString(test):
|
||||
hwType = "CPU"
|
||||
case matchAcc.MatchString(test):
|
||||
hwType = "Accelerator"
|
||||
case matchMem.MatchString(test):
|
||||
hwType = "Memory"
|
||||
case matchCore.MatchString(test):
|
||||
hwType = "Core"
|
||||
default:
|
||||
hwType = "Other"
|
||||
}
|
||||
|
||||
res = append(res, &model.EnergyFootprintValue{
|
||||
Hardware: hwType,
|
||||
Metric: name,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (any, error) {
|
||||
return r.Repo.FetchMetadata(obj)
|
||||
}
|
||||
|
||||
@@ -54,41 +126,82 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use
|
||||
return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User)
|
||||
}
|
||||
|
||||
// Name is the resolver for the name field.
|
||||
func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) {
|
||||
panic(fmt.Errorf("not implemented: Name - name"))
|
||||
}
|
||||
|
||||
// CreateTag is the resolver for the createTag field.
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
|
||||
id, err := r.Repo.CreateTag(typeArg, name)
|
||||
if err != nil {
|
||||
log.Warn("Error while creating tag")
|
||||
return nil, err
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && scope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && scope == "global" ||
|
||||
user.Username == scope {
|
||||
// Create in DB
|
||||
id, err := r.Repo.CreateTag(typeArg, name, scope)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while creating tag")
|
||||
return nil, err
|
||||
}
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to create tag with scope: %s", scope)
|
||||
return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope)
|
||||
}
|
||||
}
|
||||
|
||||
// DeleteTag is the resolver for the deleteTag field.
|
||||
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
|
||||
// This Uses ID string <-> ID string, removeTagFromList uses []string <-> []int
|
||||
panic(fmt.Errorf("not implemented: DeleteTag - deleteTag"))
|
||||
}
|
||||
|
||||
// AddTagsToJob is the resolver for the addTagsToJob field.
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while adding tag to job")
|
||||
cclog.Warn("Error while adding tag to job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing tag id")
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
|
||||
log.Warn("Error while adding tag")
|
||||
return nil, err
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
|
||||
user.Username == tscope {
|
||||
// Add to Job
|
||||
if tags, err = r.Repo.AddTag(user, jid, tid); err != nil {
|
||||
cclog.Warn("Error while adding tag")
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to add tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to add tag: %d", tid)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,39 +210,127 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
|
||||
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing job id")
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing tag id")
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
|
||||
log.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
|
||||
user.Username == tscope {
|
||||
// Remove from Job
|
||||
if tags, err = r.Repo.RemoveTag(user, jid, tid); err != nil {
|
||||
cclog.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to remove tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
// RemoveTagFromList is the resolver for the removeTagFromList field.
|
||||
func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) {
|
||||
// Needs Contextuser
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
tags := []int{}
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing tag id for removal")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
|
||||
// Remove from DB
|
||||
if err = r.Repo.RemoveTagById(tid); err != nil {
|
||||
cclog.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
} else {
|
||||
tags = append(tags, int(tid))
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to remove tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
|
||||
}
|
||||
}
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
// UpdateConfiguration is the resolver for the updateConfiguration field.
|
||||
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
|
||||
if err := repository.GetUserCfgRepo().UpdateConfig(name, value, repository.GetUserFromContext(ctx)); err != nil {
|
||||
log.Warn("Error while updating user config")
|
||||
cclog.Warn("Error while updating user config")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ID is the resolver for the id field.
|
||||
func (r *nodeResolver) ID(ctx context.Context, obj *schema.Node) (string, error) {
|
||||
panic(fmt.Errorf("not implemented: ID - id"))
|
||||
}
|
||||
|
||||
// SchedulerState is the resolver for the schedulerState field.
|
||||
func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (schema.SchedulerState, error) {
|
||||
if obj.NodeState != "" {
|
||||
return obj.NodeState, nil
|
||||
} else {
|
||||
return "", fmt.Errorf("no SchedulerState (NodeState) on Object")
|
||||
}
|
||||
}
|
||||
|
||||
// HealthState is the resolver for the healthState field.
|
||||
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
|
||||
panic(fmt.Errorf("not implemented: HealthState - healthState"))
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
|
||||
panic(fmt.Errorf("not implemented: MetaData - metaData"))
|
||||
}
|
||||
|
||||
// Clusters is the resolver for the clusters field.
|
||||
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) {
|
||||
return archive.Clusters, nil
|
||||
@@ -137,7 +338,20 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error)
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(nil)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), nil)
|
||||
}
|
||||
|
||||
// GlobalMetrics is the resolver for the globalMetrics field.
|
||||
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
|
||||
if user != nil {
|
||||
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
|
||||
return archive.GlobalUserMetricList, nil
|
||||
}
|
||||
}
|
||||
|
||||
return archive.GlobalMetricList, nil
|
||||
}
|
||||
|
||||
// User is the resolver for the user field.
|
||||
@@ -149,7 +363,7 @@ func (r *queryResolver) User(ctx context.Context, username string) (*model.User,
|
||||
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
|
||||
data, err := r.Repo.AllocatedNodes(cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while fetching allocated nodes")
|
||||
cclog.Warn("Error while fetching allocated nodes")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -164,17 +378,82 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
|
||||
return counts, nil
|
||||
}
|
||||
|
||||
// Node is the resolver for the node field.
|
||||
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
numericID, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
return repo.GetNodeByID(numericID, false)
|
||||
}
|
||||
|
||||
// Nodes is the resolver for the nodes field.
|
||||
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
nodes, err := repo.QueryNodes(ctx, filter, nil, order) // Ignore Paging, Order Unused
|
||||
count := len(nodes)
|
||||
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
|
||||
}
|
||||
|
||||
// NodeStates is the resolver for the nodeStates field.
|
||||
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
stateCounts, serr := repo.CountStates(ctx, filter, "node_state")
|
||||
if serr != nil {
|
||||
cclog.Warnf("Error while counting nodeStates: %s", serr.Error())
|
||||
return nil, serr
|
||||
}
|
||||
|
||||
healthCounts, herr := repo.CountStates(ctx, filter, "health_state")
|
||||
if herr != nil {
|
||||
cclog.Warnf("Error while counting healthStates: %s", herr.Error())
|
||||
return nil, herr
|
||||
}
|
||||
|
||||
allCounts := append(stateCounts, healthCounts...)
|
||||
|
||||
return allCounts, nil
|
||||
}
|
||||
|
||||
// NodeStatesTimed is the resolver for the nodeStatesTimed field.
|
||||
func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.NodeFilter, typeArg string) ([]*model.NodeStatesTimed, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
if typeArg == "node" {
|
||||
stateCounts, serr := repo.CountStatesTimed(ctx, filter, "node_state")
|
||||
if serr != nil {
|
||||
cclog.Warnf("Error while counting nodeStates in time: %s", serr.Error())
|
||||
return nil, serr
|
||||
}
|
||||
return stateCounts, nil
|
||||
}
|
||||
|
||||
if typeArg == "health" {
|
||||
healthCounts, herr := repo.CountStatesTimed(ctx, filter, "health_state")
|
||||
if herr != nil {
|
||||
cclog.Warnf("Error while counting healthStates in time: %s", herr.Error())
|
||||
return nil, herr
|
||||
}
|
||||
return healthCounts, nil
|
||||
}
|
||||
|
||||
return nil, errors.New("unknown Node State Query Type")
|
||||
}
|
||||
|
||||
// Job is the resolver for the job field.
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
numericID, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing job id")
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job, err := r.Repo.FindById(numericId)
|
||||
job, err := r.Repo.FindByID(ctx, numericID)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
cclog.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -188,16 +467,26 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
|
||||
}
|
||||
|
||||
// JobMetrics is the resolver for the jobMetrics field.
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying job for metrics")
|
||||
cclog.Warn("Error while querying job for metrics")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
|
||||
data, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job data")
|
||||
cclog.Warn("Error while loading job data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -215,9 +504,67 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
|
||||
return res, err
|
||||
}
|
||||
|
||||
// JobsFootprints is the resolver for the jobsFootprints field.
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
// JobStats is the resolver for the jobStats field.
|
||||
func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) {
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while querying job %s for metadata", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading jobStats data for job id %s", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.NamedStats{}
|
||||
for name, md := range data {
|
||||
res = append(res, &model.NamedStats{
|
||||
Name: name,
|
||||
Data: &md,
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
// ScopedJobStats is the resolver for the scopedJobStats field.
|
||||
func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.NamedStatsWithScope, error) {
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while querying job %s for metadata", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := make([]*model.NamedStatsWithScope, 0)
|
||||
for name, scoped := range data {
|
||||
for scope, stats := range scoped {
|
||||
|
||||
mdlStats := make([]*model.ScopedStats, 0)
|
||||
for _, stat := range stats {
|
||||
mdlStats = append(mdlStats, &model.ScopedStats{
|
||||
Hostname: stat.Hostname,
|
||||
ID: stat.Id,
|
||||
Data: stat.Data,
|
||||
})
|
||||
}
|
||||
|
||||
res = append(res, &model.NamedStatsWithScope{
|
||||
Name: name,
|
||||
Scope: scope,
|
||||
Stats: mdlStats,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// Jobs is the resolver for the jobs field.
|
||||
@@ -231,41 +578,47 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
|
||||
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying jobs")
|
||||
cclog.Warn("Error while querying jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
count, err := r.Repo.CountJobs(ctx, filter)
|
||||
if err != nil {
|
||||
log.Warn("Error while counting jobs")
|
||||
cclog.Warn("Error while counting jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !config.Keys.UiDefaults["job_list_usePaging"].(bool) {
|
||||
hasNextPage := false
|
||||
page.Page += 1
|
||||
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
if len(nextJobs) > 0 {
|
||||
hasNextPage = true
|
||||
}
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
|
||||
} else {
|
||||
return &model.JobResultList{Items: jobs, Count: &count}, nil
|
||||
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
|
||||
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
|
||||
/*
|
||||
Example Page 4 @ 10 IpP : Does item 41 exist?
|
||||
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
|
||||
*/
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
}
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hasNextPage := len(nextJobs) == 1
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
|
||||
}
|
||||
|
||||
// JobsStatistics is the resolver for the jobsStatistics field.
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) ([]*model.JobsStatistics, error) {
|
||||
var err error
|
||||
var stats []*model.JobsStatistics
|
||||
|
||||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
|
||||
// Top Level Defaults
|
||||
defaultDurationBins := "1h"
|
||||
defaultMetricBins := 10
|
||||
|
||||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalUsers") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
|
||||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
|
||||
if groupBy == nil {
|
||||
stats, err = r.Repo.JobsStats(ctx, filter)
|
||||
@@ -298,8 +651,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histDuration") || requireField(ctx, "histNumNodes") || requireField(ctx, "histNumCores") || requireField(ctx, "histNumAccs") {
|
||||
|
||||
if numDurationBins == nil {
|
||||
numDurationBins = &defaultDurationBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0])
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0], numDurationBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -309,8 +667,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histMetrics") {
|
||||
|
||||
if numMetricBins == nil {
|
||||
numMetricBins = &defaultMetricBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0])
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0], numMetricBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -322,6 +685,62 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// JobsMetricStats is the resolver for the jobsMetricStats field.
|
||||
func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.JobStats, error) {
|
||||
// No Paging, Fixed Order by StartTime ASC
|
||||
order := &model.OrderByInput{
|
||||
Field: "startTime",
|
||||
Type: "col",
|
||||
Order: "ASC",
|
||||
}
|
||||
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, nil, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying jobs for comparison")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.JobStats{}
|
||||
for _, job := range jobs {
|
||||
data, err := metricdispatcher.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
|
||||
continue
|
||||
// return nil, err
|
||||
}
|
||||
|
||||
sres := []*model.NamedStats{}
|
||||
for name, md := range data {
|
||||
sres = append(sres, &model.NamedStats{
|
||||
Name: name,
|
||||
Data: &md,
|
||||
})
|
||||
}
|
||||
|
||||
numThreadsInt := int(job.NumHWThreads)
|
||||
numAccsInt := int(job.NumAcc)
|
||||
res = append(res, &model.JobStats{
|
||||
ID: int(*job.ID),
|
||||
JobID: strconv.Itoa(int(job.JobID)),
|
||||
StartTime: int(job.StartTime),
|
||||
Duration: int(job.Duration),
|
||||
Cluster: job.Cluster,
|
||||
SubCluster: job.SubCluster,
|
||||
NumNodes: int(job.NumNodes),
|
||||
NumHWThreads: &numThreadsInt,
|
||||
NumAccelerators: &numAccsInt,
|
||||
Stats: sres,
|
||||
})
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// JobsFootprints is the resolver for the jobsFootprints field.
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
}
|
||||
|
||||
// RooflineHeatmap is the resolver for the rooflineHeatmap field.
|
||||
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
|
||||
@@ -330,8 +749,8 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
|
||||
// NodeMetrics is the resolver for the nodeMetrics field.
|
||||
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasRole(schema.RoleAdmin) {
|
||||
return nil, errors.New("you need to be an administrator for this query")
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
@@ -340,19 +759,26 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
}
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
data, err := metricdispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading node data")
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodeRepo := repository.GetNodeRepository()
|
||||
stateMap, _ := nodeRepo.MapNodes(cluster)
|
||||
|
||||
nodeMetrics := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
State: stateMap[hostname],
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, _ = archive.GetSubClusterByNode(cluster, hostname)
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
cclog.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
@@ -370,6 +796,152 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
return nodeMetrics, nil
|
||||
}
|
||||
|
||||
// NodeMetricsList is the resolver for the nodeMetricsList field.
|
||||
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, stateFilter string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
nodeRepo := repository.GetNodeRepository()
|
||||
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
|
||||
if nerr != nil {
|
||||
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := metricdispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
State: stateMap[hostname],
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
cclog.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for scope, scopedMetric := range scopedMetrics {
|
||||
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
|
||||
Name: metric,
|
||||
Scope: scope,
|
||||
Metric: scopedMetric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
nodeMetricsList = append(nodeMetricsList, host)
|
||||
}
|
||||
|
||||
nodeMetricsListResult := &model.NodesResultList{
|
||||
Items: nodeMetricsList,
|
||||
TotalNodes: &countNodes,
|
||||
HasNextPage: &hasNextPage,
|
||||
}
|
||||
|
||||
return nodeMetricsListResult, nil
|
||||
}
|
||||
|
||||
// ClusterMetrics is the resolver for the clusterMetrics field.
|
||||
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
|
||||
scopes := []schema.MetricScope{"node"}
|
||||
data, err := metricdispatcher.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
clusterMetricData := make([]*model.ClusterMetricWithName, 0)
|
||||
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}
|
||||
|
||||
collectorTimestep := make(map[string]int)
|
||||
collectorUnit := make(map[string]schema.Unit)
|
||||
collectorData := make(map[string][]schema.Float)
|
||||
|
||||
for _, metrics := range data {
|
||||
clusterMetrics.NodeCount += 1
|
||||
for metric, scopedMetrics := range metrics {
|
||||
_, ok := collectorData[metric]
|
||||
if !ok {
|
||||
collectorData[metric] = make([]schema.Float, 0)
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
// Collect Info
|
||||
collectorTimestep[metric] = scopedMetric.Timestep
|
||||
collectorUnit[metric] = scopedMetric.Unit
|
||||
// Collect Initial Data
|
||||
for _, ser := range scopedMetric.Series {
|
||||
collectorData[metric] = append(collectorData[metric], ser.Data...)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Sum up values by index
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
// For This Purpose (Cluster_Wide-Sum of Node Metrics) OK
|
||||
for _, ser := range scopedMetric.Series {
|
||||
for i, val := range ser.Data {
|
||||
collectorData[metric][i] += val
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for metricName, data := range collectorData {
|
||||
cu := collectorUnit[metricName]
|
||||
roundedData := make([]schema.Float, 0)
|
||||
for _, val := range data {
|
||||
roundedData = append(roundedData, schema.Float((math.Round(float64(val)*100.0) / 100.0)))
|
||||
}
|
||||
|
||||
cm := model.ClusterMetricWithName{
|
||||
Name: metricName,
|
||||
Unit: &cu,
|
||||
Timestep: collectorTimestep[metricName],
|
||||
Data: roundedData,
|
||||
}
|
||||
|
||||
clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
|
||||
}
|
||||
|
||||
return &clusterMetrics, nil
|
||||
}
|
||||
|
||||
// NumberOfNodes is the resolver for the numberOfNodes field.
|
||||
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
|
||||
nodeList, err := archive.ParseNodeList(obj.Nodes)
|
||||
@@ -385,17 +957,27 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver
|
||||
// Job returns generated.JobResolver implementation.
|
||||
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
|
||||
|
||||
// MetricValue returns generated.MetricValueResolver implementation.
|
||||
func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} }
|
||||
|
||||
// Mutation returns generated.MutationResolver implementation.
|
||||
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
|
||||
|
||||
// Node returns generated.NodeResolver implementation.
|
||||
func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} }
|
||||
|
||||
// Query returns generated.QueryResolver implementation.
|
||||
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
|
||||
|
||||
// SubCluster returns generated.SubClusterResolver implementation.
|
||||
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
|
||||
|
||||
type clusterResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
||||
type subClusterResolver struct{ *Resolver }
|
||||
type (
|
||||
clusterResolver struct{ *Resolver }
|
||||
jobResolver struct{ *Resolver }
|
||||
metricValueResolver struct{ *Resolver }
|
||||
mutationResolver struct{ *Resolver }
|
||||
nodeResolver struct{ *Resolver }
|
||||
queryResolver struct{ *Resolver }
|
||||
subClusterResolver struct{ *Resolver }
|
||||
)
|
||||
|
||||
@@ -1,20 +1,21 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatcher"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||
@@ -24,11 +25,11 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
rows int, cols int,
|
||||
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
|
||||
minX float64, minY float64, maxX float64, maxY float64,
|
||||
) ([][]float64, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
log.Error("Error while querying jobs for roofline")
|
||||
cclog.Error("Error while querying jobs for roofline")
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
@@ -47,15 +48,22 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
continue
|
||||
}
|
||||
|
||||
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
// resolution := 0
|
||||
|
||||
// for _, mc := range metricConfigs {
|
||||
// resolution = max(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
jobdata, err := metricdispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
continue
|
||||
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
@@ -63,7 +71,7 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
flops, ok1 := flops_["node"]
|
||||
membw, ok2 := membw_["node"]
|
||||
if !ok1 || !ok2 {
|
||||
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
cclog.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
continue
|
||||
// TODO/FIXME:
|
||||
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
@@ -98,7 +106,7 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
log.Error("Error while querying jobs for footprint")
|
||||
cclog.Error("Error while querying jobs for footprint")
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
@@ -120,8 +128,8 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
log.Error("Error while loading averages for footprint")
|
||||
if err := metricdispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
cclog.Error("Error while loading averages for footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -179,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
func requireField(ctx context.Context, name string) bool {
|
||||
fields := graphql.CollectAllFields(ctx)
|
||||
|
||||
for _, f := range fields {
|
||||
if f == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
return slices.Contains(fields, name)
|
||||
}
|
||||
|
||||
132
internal/importer/README.md
Normal file
132
internal/importer/README.md
Normal file
@@ -0,0 +1,132 @@
|
||||
# Importer Package
|
||||
|
||||
The `importer` package provides functionality for importing job data into the ClusterCockpit database from archived job files.
|
||||
|
||||
## Overview
|
||||
|
||||
This package supports two primary import workflows:
|
||||
|
||||
1. **Bulk Database Initialization** - Reinitialize the entire job database from archived jobs
|
||||
2. **Individual Job Import** - Import specific jobs from metadata/data file pairs
|
||||
|
||||
Both workflows enrich job metadata by calculating performance footprints and energy consumption metrics before persisting to the database.
|
||||
|
||||
## Main Entry Points
|
||||
|
||||
### InitDB()
|
||||
|
||||
Reinitializes the job database from all archived jobs.
|
||||
|
||||
```go
|
||||
if err := importer.InitDB(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
```
|
||||
|
||||
This function:
|
||||
- Flushes existing job, tag, and jobtag tables
|
||||
- Iterates through all jobs in the configured archive
|
||||
- Enriches each job with calculated metrics
|
||||
- Inserts jobs into the database in batched transactions (100 jobs per batch)
|
||||
- Continues on individual job failures, logging errors
|
||||
|
||||
**Use Case**: Initial database setup or complete database rebuild from archive.
|
||||
|
||||
### HandleImportFlag(flag string)
|
||||
|
||||
Imports jobs from specified file pairs.
|
||||
|
||||
```go
|
||||
// Format: "<meta.json>:<data.json>[,<meta2.json>:<data2.json>,...]"
|
||||
flag := "/path/to/meta.json:/path/to/data.json"
|
||||
if err := importer.HandleImportFlag(flag); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
```
|
||||
|
||||
This function:
|
||||
- Parses the comma-separated file pairs
|
||||
- Validates metadata and job data against schemas (if validation enabled)
|
||||
- Enriches each job with footprints and energy metrics
|
||||
- Imports jobs into both the archive and database
|
||||
- Fails fast on the first error
|
||||
|
||||
**Use Case**: Importing specific jobs from external sources or manual job additions.
|
||||
|
||||
## Job Enrichment
|
||||
|
||||
Both import workflows use `enrichJobMetadata()` to calculate:
|
||||
|
||||
### Performance Footprints
|
||||
|
||||
Performance footprints are calculated from metric averages based on the subcluster configuration:
|
||||
|
||||
```go
|
||||
job.Footprint["mem_used_avg"] = 45.2 // GB
|
||||
job.Footprint["cpu_load_avg"] = 0.87 // percentage
|
||||
```
|
||||
|
||||
### Energy Metrics
|
||||
|
||||
Energy consumption is calculated from power metrics using the formula:
|
||||
|
||||
```
|
||||
Energy (kWh) = (Power (W) × Duration (s) / 3600) / 1000
|
||||
```
|
||||
|
||||
For each energy metric:
|
||||
```go
|
||||
job.EnergyFootprint["acc_power"] = 12.5 // kWh
|
||||
job.Energy = 150.2 // Total energy in kWh
|
||||
```
|
||||
|
||||
**Note**: Energy calculations for metrics with unit "energy" (Joules) are not yet implemented.
|
||||
|
||||
## Data Validation
|
||||
|
||||
### SanityChecks(job *schema.Job)
|
||||
|
||||
Validates job metadata before database insertion:
|
||||
|
||||
- Cluster exists in configuration
|
||||
- Subcluster is valid (assigns if needed)
|
||||
- Job state is valid
|
||||
- Resources and user fields are populated
|
||||
- Node counts and hardware thread counts are positive
|
||||
- Resource count matches declared node count
|
||||
|
||||
## Normalization Utilities
|
||||
|
||||
The package includes utilities for normalizing metric values to appropriate SI prefixes:
|
||||
|
||||
### Normalize(avg float64, prefix string)
|
||||
|
||||
Adjusts values and SI prefixes for readability:
|
||||
|
||||
```go
|
||||
factor, newPrefix := importer.Normalize(2048.0, "M")
|
||||
// Converts 2048 MB → ~2.0 GB
|
||||
// Returns: factor for conversion, "G"
|
||||
```
|
||||
|
||||
This is useful for automatically scaling metrics (e.g., memory, storage) to human-readable units.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `github.com/ClusterCockpit/cc-backend/internal/repository` - Database operations
|
||||
- `github.com/ClusterCockpit/cc-backend/pkg/archive` - Job archive access
|
||||
- `github.com/ClusterCockpit/cc-lib/schema` - Job schema definitions
|
||||
- `github.com/ClusterCockpit/cc-lib/ccLogger` - Logging
|
||||
- `github.com/ClusterCockpit/cc-lib/ccUnits` - SI unit handling
|
||||
|
||||
## Error Handling
|
||||
|
||||
- **InitDB**: Continues processing on individual job failures, logs errors, returns summary
|
||||
- **HandleImportFlag**: Fails fast on first error, returns immediately
|
||||
- Both functions log detailed error context for debugging
|
||||
|
||||
## Performance
|
||||
|
||||
- **Transaction Batching**: InitDB processes jobs in batches of 100 for optimal database performance
|
||||
- **Tag Caching**: Tag IDs are cached during import to minimize database queries
|
||||
- **Progress Reporting**: InitDB prints progress updates during bulk operations
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -10,16 +10,30 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
|
||||
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
|
||||
//
|
||||
// The flag format is: "<path-to-meta.json>:<path-to-data.json>[,<path-to-meta2.json>:<path-to-data2.json>,...]"
|
||||
//
|
||||
// For each job pair, this function:
|
||||
// 1. Reads and validates the metadata JSON file (schema.Job)
|
||||
// 2. Reads and validates the job data JSON file (schema.JobData)
|
||||
// 3. Enriches the job with calculated footprints and energy metrics
|
||||
// 4. Validates the job using SanityChecks()
|
||||
// 5. Imports the job into the archive
|
||||
// 6. Inserts the job into the database with associated tags
|
||||
//
|
||||
// Schema validation is performed if config.Keys.Validate is true.
|
||||
//
|
||||
// Returns an error if file reading, validation, enrichment, or database operations fail.
|
||||
// The function stops processing on the first error encountered.
|
||||
func HandleImportFlag(flag string) error {
|
||||
r := repository.GetJobRepository()
|
||||
|
||||
@@ -31,7 +45,7 @@ func HandleImportFlag(flag string) error {
|
||||
|
||||
raw, err := os.ReadFile(files[0])
|
||||
if err != nil {
|
||||
log.Warn("Error while reading metadata file for import")
|
||||
cclog.Warn("Error while reading metadata file for import")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -42,15 +56,18 @@ func HandleImportFlag(flag string) error {
|
||||
}
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err = dec.Decode(&jobMeta); err != nil {
|
||||
log.Warn("Error while decoding raw json metadata for import")
|
||||
job := schema.Job{
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
}
|
||||
if err = dec.Decode(&job); err != nil {
|
||||
cclog.Warn("Error while decoding raw json metadata for import")
|
||||
return err
|
||||
}
|
||||
|
||||
raw, err = os.ReadFile(files[1])
|
||||
if err != nil {
|
||||
log.Warn("Error while reading jobdata file for import")
|
||||
cclog.Warn("Error while reading jobdata file for import")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -63,72 +80,41 @@ func HandleImportFlag(flag string) error {
|
||||
dec.DisallowUnknownFields()
|
||||
jobData := schema.JobData{}
|
||||
if err = dec.Decode(&jobData); err != nil {
|
||||
log.Warn("Error while decoding raw json jobdata for import")
|
||||
cclog.Warn("Error while decoding raw json jobdata for import")
|
||||
return err
|
||||
}
|
||||
|
||||
// checkJobData(&jobData)
|
||||
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
|
||||
// if err != nil {
|
||||
// log.Warn("Error while finding job in jobRepository")
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
|
||||
// }
|
||||
//
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job resources")
|
||||
return err
|
||||
}
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job metadata")
|
||||
if err = enrichJobMetadata(&job); err != nil {
|
||||
cclog.Errorf("Error enriching job metadata: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if err = SanityChecks(&job.BaseJob); err != nil {
|
||||
log.Warn("BaseJob SanityChecks failed")
|
||||
if err = SanityChecks(&job); err != nil {
|
||||
cclog.Warn("BaseJob SanityChecks failed")
|
||||
return err
|
||||
}
|
||||
|
||||
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
|
||||
log.Error("Error while importing job")
|
||||
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
|
||||
cclog.Error("Error while importing job")
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := r.InsertJob(&job)
|
||||
if err != nil {
|
||||
log.Warn("Error while job db insert")
|
||||
cclog.Warn("Error while job db insert")
|
||||
return err
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
log.Error("Error while adding or creating tag")
|
||||
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
cclog.Error("Error while adding or creating tag on import")
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||
cclog.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer_test
|
||||
@@ -16,9 +16,12 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
// copyFile copies a file from source path to destination path.
|
||||
// Used by tests to set up test fixtures.
|
||||
func copyFile(s string, d string) error {
|
||||
r, err := os.Open(s)
|
||||
if err != nil {
|
||||
@@ -34,21 +37,29 @@ func copyFile(s string, d string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// setup initializes a test environment for importer tests.
|
||||
//
|
||||
// Creates a temporary directory with:
|
||||
// - A test job archive with cluster configuration
|
||||
// - A SQLite database initialized with schema
|
||||
// - Configuration files loaded
|
||||
//
|
||||
// Returns a JobRepository instance for test assertions.
|
||||
func setup(t *testing.T) *repository.JobRepository {
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 64 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
@@ -57,7 +68,6 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
},
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 944 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
@@ -66,7 +76,6 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
},
|
||||
{
|
||||
"name": "taurus",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 4000 },
|
||||
"duration": { "from": 0, "to": 604800 },
|
||||
@@ -75,18 +84,18 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
}
|
||||
]}`
|
||||
|
||||
log.Init("info", true)
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
||||
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
|
||||
if err := os.Mkdir(fritzArchive, 0777); err != nil {
|
||||
if err := os.Mkdir(fritzArchive, 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := copyFile(filepath.Join("testdata", "cluster-fritz.json"),
|
||||
@@ -95,17 +104,29 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
||||
err := repository.MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
config.Init(cfgFilePath)
|
||||
ccconf.Init(cfgFilePath)
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
t.Fatal("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
t.Fatal("Main configuration must be present")
|
||||
}
|
||||
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
@@ -116,6 +137,7 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
return repository.GetJobRepository()
|
||||
}
|
||||
|
||||
// Result represents the expected test result for job import verification.
|
||||
type Result struct {
|
||||
JobId int64
|
||||
Cluster string
|
||||
@@ -123,6 +145,8 @@ type Result struct {
|
||||
Duration int32
|
||||
}
|
||||
|
||||
// readResult reads the expected test result from a golden file.
|
||||
// Golden files contain the expected job attributes after import.
|
||||
func readResult(t *testing.T, testname string) Result {
|
||||
var r Result
|
||||
|
||||
@@ -140,6 +164,13 @@ func readResult(t *testing.T, testname string) Result {
|
||||
return r
|
||||
}
|
||||
|
||||
// TestHandleImportFlag tests the HandleImportFlag function with various job import scenarios.
|
||||
//
|
||||
// The test uses golden files in testdata/ to verify that jobs are correctly:
|
||||
// - Parsed from metadata and data JSON files
|
||||
// - Enriched with footprints and energy metrics
|
||||
// - Inserted into the database
|
||||
// - Retrievable with correct attributes
|
||||
func TestHandleImportFlag(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
@@ -163,7 +194,7 @@ func TestHandleImportFlag(t *testing.T) {
|
||||
}
|
||||
|
||||
result := readResult(t, testname)
|
||||
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -1,40 +1,68 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package importer provides functionality for importing job data into the ClusterCockpit database.
|
||||
//
|
||||
// The package supports two primary use cases:
|
||||
// 1. Bulk database initialization from archived jobs via InitDB()
|
||||
// 2. Individual job import from file pairs via HandleImportFlag()
|
||||
//
|
||||
// Both operations enrich job metadata by calculating footprints and energy metrics
|
||||
// before persisting to the database.
|
||||
package importer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Delete the tables "job", "tag" and "jobtag" from the database and
|
||||
// repopulate them using the jobs found in `archive`.
|
||||
const (
|
||||
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
|
||||
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
|
||||
)
|
||||
|
||||
// InitDB reinitializes the job database from archived job data.
|
||||
//
|
||||
// This function performs the following operations:
|
||||
// 1. Flushes existing job, tag, and jobtag tables
|
||||
// 2. Iterates through all jobs in the archive
|
||||
// 3. Enriches each job with calculated footprints and energy metrics
|
||||
// 4. Inserts jobs and tags into the database in batched transactions
|
||||
//
|
||||
// Jobs are processed in batches of 100 for optimal performance. The function
|
||||
// continues processing even if individual jobs fail, logging errors and
|
||||
// returning a summary at the end.
|
||||
//
|
||||
// Returns an error if database initialization, transaction management, or
|
||||
// critical operations fail. Individual job failures are logged but do not
|
||||
// stop the overall import process.
|
||||
func InitDB() error {
|
||||
r := repository.GetJobRepository()
|
||||
if err := r.Flush(); err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
return err
|
||||
}
|
||||
starttime := time.Now()
|
||||
log.Print("Building job table...")
|
||||
cclog.Print("Building job table...")
|
||||
|
||||
t, err := r.TransactionInit()
|
||||
if err != nil {
|
||||
log.Warn("Error while initializing SQL transactions")
|
||||
cclog.Warn("Error while initializing SQL transactions")
|
||||
return err
|
||||
}
|
||||
tags := make(map[string]int64)
|
||||
|
||||
// Not using log.Print because we want the line to end with `\r` and
|
||||
// Not using cclog.Print because we want the line to end with `\r` and
|
||||
// this function is only ever called when a special command line flag
|
||||
// is passed anyways.
|
||||
fmt.Printf("%d jobs inserted...\r", 0)
|
||||
@@ -46,92 +74,195 @@ func InitDB() error {
|
||||
for jobContainer := range ar.Iter(false) {
|
||||
|
||||
jobMeta := jobContainer.Meta
|
||||
if jobMeta == nil {
|
||||
cclog.Warn("skipping job with nil metadata")
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
// Bundle 100 inserts into one transaction for better performance
|
||||
if i%100 == 0 {
|
||||
r.TransactionCommit(t)
|
||||
if i > 0 {
|
||||
if err := t.Commit(); err != nil {
|
||||
cclog.Errorf("transaction commit error: %v", err)
|
||||
return err
|
||||
}
|
||||
// Start a new transaction for the next batch
|
||||
t, err = r.TransactionInit()
|
||||
if err != nil {
|
||||
cclog.Errorf("transaction init error: %v", err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
fmt.Printf("%d jobs inserted...\r", i)
|
||||
}
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
if err := enrichJobMetadata(jobMeta); err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
if err := SanityChecks(jobMeta); err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
if err := SanityChecks(&job.BaseJob); err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
id, jobErr := r.TransactionAddNamed(t,
|
||||
repository.NamedJobInsert, jobMeta)
|
||||
if jobErr != nil {
|
||||
cclog.Errorf("repository initDB(): %v", jobErr)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
id, err := r.TransactionAdd(t, job)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
// Job successfully inserted, increment counter
|
||||
i += 1
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
for _, tag := range jobMeta.Tags {
|
||||
tagstr := tag.Name + ":" + tag.Type
|
||||
tagId, ok := tags[tagstr]
|
||||
tagID, ok := tags[tagstr]
|
||||
if !ok {
|
||||
tagId, err = r.TransactionAddTag(t, tag)
|
||||
var err error
|
||||
tagID, err = r.TransactionAdd(t,
|
||||
addTagQuery,
|
||||
tag.Name, tag.Type)
|
||||
if err != nil {
|
||||
log.Errorf("Error adding tag: %v", err)
|
||||
cclog.Errorf("Error adding tag: %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
tags[tagstr] = tagId
|
||||
tags[tagstr] = tagID
|
||||
}
|
||||
|
||||
r.TransactionSetTag(t, id, tagId)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
i += 1
|
||||
r.TransactionAdd(t,
|
||||
setTagQuery,
|
||||
id, tagID)
|
||||
}
|
||||
}
|
||||
|
||||
if errorOccured > 0 {
|
||||
log.Warnf("Error in import of %d jobs!", errorOccured)
|
||||
cclog.Warnf("Error in import of %d jobs!", errorOccured)
|
||||
}
|
||||
|
||||
r.TransactionEnd(t)
|
||||
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
|
||||
cclog.Infof("A total of %d jobs have been registered in %.3f seconds.", i, time.Since(starttime).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
// This function also sets the subcluster if necessary!
|
||||
func SanityChecks(job *schema.BaseJob) error {
|
||||
// enrichJobMetadata calculates and populates job footprints, energy metrics, and serialized fields.
|
||||
//
|
||||
// This function performs the following enrichment operations:
|
||||
// 1. Calculates job footprint metrics based on the subcluster configuration
|
||||
// 2. Computes energy footprint and total energy consumption in kWh
|
||||
// 3. Marshals footprints, resources, and metadata into JSON for database storage
|
||||
//
|
||||
// The function expects the job's MonitoringStatus and SubCluster to be already set.
|
||||
// Energy calculations convert power metrics (Watts) to energy (kWh) using the formula:
|
||||
//
|
||||
// Energy (kWh) = (Power (W) * Duration (s) / 3600) / 1000
|
||||
//
|
||||
// Returns an error if subcluster retrieval, metric indexing, or JSON marshaling fails.
|
||||
func enrichJobMetadata(job *schema.Job) error {
|
||||
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
job.Footprint = make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
statType := "avg"
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
|
||||
job.Footprint[name] = repository.LoadJobStat(job, fp, statType)
|
||||
}
|
||||
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while marshaling job footprint")
|
||||
return err
|
||||
}
|
||||
|
||||
job.EnergyFootprint = make(map[string]float64)
|
||||
|
||||
// Total Job Energy Outside Loop
|
||||
totalEnergy := 0.0
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
// Always Init Metric Energy Inside Loop
|
||||
metricEnergy := 0.0
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
switch sc.MetricConfig[i].Energy {
|
||||
case "energy": // this metric has energy as unit (Joules)
|
||||
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
|
||||
// FIXME: Needs sum as stats type
|
||||
case "power": // this metric has power as unit (Watt)
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
// Unit: (W * (s / 3600)) / 1000 = kWh
|
||||
// Round 2 Digits: round(Energy * 100) / 100
|
||||
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
|
||||
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||
rawEnergy := ((repository.LoadJobStat(job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
|
||||
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
|
||||
}
|
||||
|
||||
job.EnergyFootprint[fp] = metricEnergy
|
||||
totalEnergy += metricEnergy
|
||||
}
|
||||
|
||||
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
|
||||
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while marshaling job resources")
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while marshaling job metadata")
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SanityChecks validates job metadata and ensures cluster/subcluster configuration is valid.
|
||||
//
|
||||
// This function performs the following validations:
|
||||
// 1. Verifies the cluster exists in the archive configuration
|
||||
// 2. Assigns and validates the subcluster (may modify job.SubCluster)
|
||||
// 3. Validates job state is a recognized value
|
||||
// 4. Ensures resources and user fields are populated
|
||||
// 5. Validates node counts and hardware thread counts are positive
|
||||
// 6. Verifies the number of resources matches the declared node count
|
||||
//
|
||||
// The function may modify the job's SubCluster field if it needs to be assigned.
|
||||
//
|
||||
// Returns an error if any validation check fails.
|
||||
func SanityChecks(job *schema.Job) error {
|
||||
if c := archive.GetCluster(job.Cluster); c == nil {
|
||||
return fmt.Errorf("no such cluster: %v", job.Cluster)
|
||||
}
|
||||
if err := archive.AssignSubCluster(job); err != nil {
|
||||
log.Warn("Error while assigning subcluster to job")
|
||||
cclog.Warn("Error while assigning subcluster to job")
|
||||
return err
|
||||
}
|
||||
if !job.State.Valid() {
|
||||
@@ -150,18 +281,14 @@ func SanityChecks(job *schema.BaseJob) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
if metric == "mem_used" {
|
||||
return stats.Max
|
||||
} else {
|
||||
return stats.Avg
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// checkJobData normalizes metric units in job data based on average values.
|
||||
//
|
||||
// NOTE: This function is currently unused and contains incomplete implementation.
|
||||
// It was intended to normalize byte and file-related metrics to appropriate SI prefixes,
|
||||
// but the normalization logic is commented out. Consider removing or completing this
|
||||
// function based on project requirements.
|
||||
//
|
||||
// TODO: Either implement the metric normalization or remove this dead code.
|
||||
func checkJobData(d *schema.JobData) error {
|
||||
for _, scopes := range *d {
|
||||
// var newUnit schema.Unit
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -7,13 +7,27 @@ package importer
|
||||
import (
|
||||
"math"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-units"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
)
|
||||
|
||||
// getNormalizationFactor calculates the scaling factor needed to normalize a value
|
||||
// to a more readable range (typically between 1.0 and 1000.0).
|
||||
//
|
||||
// For values greater than 1000, the function scales down by factors of 1000 (returns negative exponent).
|
||||
// For values less than 1.0, the function scales up by factors of 1000 (returns positive exponent).
|
||||
//
|
||||
// Returns:
|
||||
// - factor: The multiplicative factor to apply (10^(count*scale))
|
||||
// - exponent: The power of 10 representing the adjustment (multiple of 3 for SI prefixes)
|
||||
func getNormalizationFactor(v float64) (float64, int) {
|
||||
count := 0
|
||||
scale := -3
|
||||
|
||||
// Prevent infinite loop for zero or negative values
|
||||
if v <= 0.0 {
|
||||
return 1.0, 0
|
||||
}
|
||||
|
||||
if v > 1000.0 {
|
||||
for v > 1000.0 {
|
||||
v *= 1e-3
|
||||
@@ -29,9 +43,22 @@ func getNormalizationFactor(v float64) (float64, int) {
|
||||
return math.Pow10(count * scale), count * scale
|
||||
}
|
||||
|
||||
// getExponent calculates the SI prefix exponent from a numeric prefix value.
|
||||
//
|
||||
// For example:
|
||||
// - Input: 1000.0 (kilo) returns 3
|
||||
// - Input: 1000000.0 (mega) returns 6
|
||||
// - Input: 1000000000.0 (giga) returns 9
|
||||
//
|
||||
// Returns the exponent representing the power of 10 for the SI prefix.
|
||||
func getExponent(p float64) int {
|
||||
count := 0
|
||||
|
||||
// Prevent infinite loop for infinity or NaN values
|
||||
if math.IsInf(p, 0) || math.IsNaN(p) || p <= 0.0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
for p > 1.0 {
|
||||
p = p / 1000.0
|
||||
count++
|
||||
@@ -40,12 +67,42 @@ func getExponent(p float64) int {
|
||||
return count * 3
|
||||
}
|
||||
|
||||
// newPrefixFromFactor computes a new SI unit prefix after applying a normalization factor.
|
||||
//
|
||||
// Given an original prefix and an exponent adjustment, this function calculates
|
||||
// the resulting SI prefix. For example, if normalizing from bytes (no prefix) by
|
||||
// a factor of 10^9, the result would be the "G" (giga) prefix.
|
||||
//
|
||||
// Parameters:
|
||||
// - op: The original SI prefix value
|
||||
// - e: The exponent adjustment to apply
|
||||
//
|
||||
// Returns the new SI prefix after adjustment.
|
||||
func newPrefixFromFactor(op ccunits.Prefix, e int) ccunits.Prefix {
|
||||
f := float64(op)
|
||||
exp := math.Pow10(getExponent(f) - e)
|
||||
return ccunits.Prefix(exp)
|
||||
}
|
||||
|
||||
// Normalize adjusts a metric value and its SI unit prefix to a more readable range.
|
||||
//
|
||||
// This function is useful for automatically scaling metrics to appropriate units.
|
||||
// For example, normalizing 2048 MiB might result in ~2.0 GiB.
|
||||
//
|
||||
// The function analyzes the average value and determines if a different SI prefix
|
||||
// would make the number more human-readable (typically keeping values between 1 and 1000).
|
||||
//
|
||||
// Parameters:
|
||||
// - avg: The metric value to normalize
|
||||
// - p: The current SI prefix as a string (e.g., "K", "M", "G")
|
||||
//
|
||||
// Returns:
|
||||
// - factor: The multiplicative factor to apply to convert the value
|
||||
// - newPrefix: The new SI prefix string to use
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// factor, newPrefix := Normalize(2048.0, "M") // returns factor for MB->GB conversion, "G"
|
||||
func Normalize(avg float64, p string) (float64, string) {
|
||||
f, e := getNormalizationFactor(avg)
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -8,9 +8,11 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-units"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
)
|
||||
|
||||
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.
|
||||
// Verifies that values in the billions are correctly scaled to the "G" (giga) prefix.
|
||||
func TestNormalizeFactor(t *testing.T) {
|
||||
// var us string
|
||||
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
|
||||
@@ -38,6 +40,8 @@ func TestNormalizeFactor(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestNormalizeKeep tests that values already in an appropriate range maintain their prefix.
|
||||
// Verifies that when values don't require rescaling, the original "G" prefix is preserved.
|
||||
func TestNormalizeKeep(t *testing.T) {
|
||||
s := []float64{3.0, 24.0, 390.0, 391.0}
|
||||
|
||||
|
||||
1486
internal/importer/testdata/cluster-fritz.json
vendored
1486
internal/importer/testdata/cluster-fritz.json
vendored
File diff suppressed because it is too large
Load Diff
@@ -1 +1 @@
|
||||
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
|
||||
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
|
||||
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
|
||||
|
||||
232
internal/memorystore/api.go
Normal file
232
internal/memorystore/api.go
Normal file
@@ -0,0 +1,232 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
|
||||
ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty")
|
||||
)
|
||||
|
||||
type APIMetricData struct {
|
||||
Error *string `json:"error,omitempty"`
|
||||
Data schema.FloatArray `json:"data,omitempty"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
}
|
||||
|
||||
type APIQueryRequest struct {
|
||||
Cluster string `json:"cluster"`
|
||||
Queries []APIQuery `json:"queries"`
|
||||
ForAllNodes []string `json:"for-all-nodes"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
WithStats bool `json:"with-stats"`
|
||||
WithData bool `json:"with-data"`
|
||||
WithPadding bool `json:"with-padding"`
|
||||
}
|
||||
|
||||
type APIQueryResponse struct {
|
||||
Queries []APIQuery `json:"queries,omitempty"`
|
||||
Results [][]APIMetricData `json:"results"`
|
||||
}
|
||||
|
||||
type APIQuery struct {
|
||||
Type *string `json:"type,omitempty"`
|
||||
SubType *string `json:"subtype,omitempty"`
|
||||
Metric string `json:"metric"`
|
||||
Hostname string `json:"host"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
TypeIds []string `json:"type-ids,omitempty"`
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||
ScaleFactor schema.Float `json:"scale-by,omitempty"`
|
||||
Aggregate bool `json:"aggreg"`
|
||||
}
|
||||
|
||||
// TODO: Optimize this, just like the stats endpoint!
|
||||
func (data *APIMetricData) AddStats() {
|
||||
n := 0
|
||||
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
|
||||
for _, x := range data.Data {
|
||||
if x.IsNaN() {
|
||||
continue
|
||||
}
|
||||
|
||||
n += 1
|
||||
sum += float64(x)
|
||||
min = math.Min(min, float64(x))
|
||||
max = math.Max(max, float64(x))
|
||||
}
|
||||
|
||||
if n > 0 {
|
||||
avg := sum / float64(n)
|
||||
data.Avg = schema.Float(avg)
|
||||
data.Min = schema.Float(min)
|
||||
data.Max = schema.Float(max)
|
||||
} else {
|
||||
data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) ScaleBy(f schema.Float) {
|
||||
if f == 0 || f == 1 {
|
||||
return
|
||||
}
|
||||
|
||||
data.Avg *= f
|
||||
data.Min *= f
|
||||
data.Max *= f
|
||||
for i := 0; i < len(data.Data); i++ {
|
||||
data.Data[i] *= f
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
|
||||
minfo, ok := ms.Metrics[metric]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
|
||||
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
|
||||
ndata := make([]schema.Float, 0, padfront+len(data.Data))
|
||||
for range padfront {
|
||||
ndata = append(ndata, schema.NaN)
|
||||
}
|
||||
for j := 0; j < len(data.Data); j++ {
|
||||
ndata = append(ndata, data.Data[j])
|
||||
}
|
||||
data.Data = ndata
|
||||
}
|
||||
}
|
||||
|
||||
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
|
||||
if req.From > req.To {
|
||||
return nil, ErrInvalidTimeRange
|
||||
}
|
||||
if req.Cluster == "" && req.ForAllNodes != nil {
|
||||
return nil, ErrEmptyCluster
|
||||
}
|
||||
|
||||
req.WithData = true
|
||||
ms := GetMemoryStore()
|
||||
if ms == nil {
|
||||
return nil, fmt.Errorf("memorystore not initialized")
|
||||
}
|
||||
|
||||
|
||||
response := APIQueryResponse{
|
||||
Results: make([][]APIMetricData, 0, len(req.Queries)),
|
||||
}
|
||||
if req.ForAllNodes != nil {
|
||||
nodes := ms.ListChildren([]string{req.Cluster})
|
||||
for _, node := range nodes {
|
||||
for _, metric := range req.ForAllNodes {
|
||||
q := APIQuery{
|
||||
Metric: metric,
|
||||
Hostname: node,
|
||||
}
|
||||
req.Queries = append(req.Queries, q)
|
||||
response.Queries = append(response.Queries, q)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, query := range req.Queries {
|
||||
sels := make([]util.Selector, 0, 1)
|
||||
if query.Aggregate || query.Type == nil {
|
||||
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
|
||||
if query.Type != nil {
|
||||
if len(query.TypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.TypeIds))
|
||||
for i, id := range query.TypeIds {
|
||||
ids[i] = *query.Type + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
|
||||
if query.SubType != nil {
|
||||
if len(query.SubTypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.SubTypeIds))
|
||||
for i, id := range query.SubTypeIds {
|
||||
ids[i] = *query.SubType + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
}
|
||||
}
|
||||
sels = append(sels, sel)
|
||||
} else {
|
||||
for _, typeID := range query.TypeIds {
|
||||
if query.SubType != nil {
|
||||
for _, subTypeID := range query.SubTypeIds {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
{String: *query.SubType + subTypeID},
|
||||
})
|
||||
}
|
||||
} else {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// log.Printf("query: %#v\n", query)
|
||||
// log.Printf("sels: %#v\n", sels)
|
||||
var err error
|
||||
res := make([]APIMetricData, 0, len(sels))
|
||||
for _, sel := range sels {
|
||||
data := APIMetricData{}
|
||||
|
||||
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
|
||||
if err != nil {
|
||||
msg := err.Error()
|
||||
data.Error = &msg
|
||||
res = append(res, data)
|
||||
continue
|
||||
}
|
||||
|
||||
if req.WithStats {
|
||||
data.AddStats()
|
||||
}
|
||||
if query.ScaleFactor != 0 {
|
||||
data.ScaleBy(query.ScaleFactor)
|
||||
}
|
||||
if req.WithPadding {
|
||||
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
|
||||
}
|
||||
if !req.WithData {
|
||||
data.Data = nil
|
||||
}
|
||||
res = append(res, data)
|
||||
}
|
||||
response.Results = append(response.Results, res)
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
196
internal/memorystore/archive.go
Normal file
196
internal/memorystore/archive.go
Normal file
@@ -0,0 +1,196 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Archive.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(d)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339))
|
||||
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
|
||||
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var ErrNoNewArchiveData error = errors.New("all data already archived")
|
||||
|
||||
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
|
||||
// deleting them from the `checkpointsDir`.
|
||||
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries1, err := os.ReadDir(checkpointsDir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
type workItem struct {
|
||||
cdir, adir string
|
||||
cluster, host string
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
n, errs := int32(0), int32(0)
|
||||
work := make(chan workItem, Keys.NumWorkers)
|
||||
|
||||
wg.Add(Keys.NumWorkers)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for workItem := range work {
|
||||
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(m))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for _, de1 := range entries1 {
|
||||
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
}
|
||||
|
||||
for _, de2 := range entries2 {
|
||||
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
|
||||
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
|
||||
work <- workItem{
|
||||
adir: adir, cdir: cdir,
|
||||
cluster: de1.Name(), host: de2.Name(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Helper function for `ArchiveCheckpoints`.
|
||||
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
extension := Keys.Checkpoints.FileFormat
|
||||
files, err := findFiles(entries, from, extension, false)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if deleteInstead {
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
|
||||
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(archiveDir, CheckpointDirPerms)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
bw := bufio.NewWriter(f)
|
||||
defer bw.Flush()
|
||||
zw := zip.NewWriter(bw)
|
||||
defer zw.Close()
|
||||
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
// Use closure to ensure file is closed immediately after use,
|
||||
// avoiding file descriptor leak from defer in loop
|
||||
err := func() error {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
r, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
w, err := zw.Create(checkpoint)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = io.Copy(w, r); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}()
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
477
internal/memorystore/avroCheckpoint.go
Normal file
477
internal/memorystore/avroCheckpoint.go
Normal file
@@ -0,0 +1,477 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
var NumAvroWorkers int = DefaultAvroWorkers
|
||||
var startUp bool = true
|
||||
|
||||
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
|
||||
levels := make([]*AvroLevel, 0)
|
||||
selectors := make([][]string, 0)
|
||||
as.root.lock.RLock()
|
||||
// Cluster
|
||||
for sel1, l1 := range as.root.children {
|
||||
l1.lock.RLock()
|
||||
// Node
|
||||
for sel2, l2 := range l1.children {
|
||||
l2.lock.RLock()
|
||||
// Frequency
|
||||
for sel3, l3 := range l2.children {
|
||||
levels = append(levels, l3)
|
||||
selectors = append(selectors, []string{sel1, sel2, sel3})
|
||||
}
|
||||
l2.lock.RUnlock()
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
as.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *AvroLevel
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(NumAvroWorkers)
|
||||
work := make(chan workItem, NumAvroWorkers*2)
|
||||
for range NumAvroWorkers {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
from := getTimestamp(workItem.dir)
|
||||
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := range len(levels) {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
|
||||
startUp = false
|
||||
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// getTimestamp returns the timestamp from the directory name
|
||||
func getTimestamp(dir string) int64 {
|
||||
// Extract the resolution and timestamp from the directory name
|
||||
// The existing avro file will be in epoch timestamp format
|
||||
// iterate over all the files in the directory and find the maximum timestamp
|
||||
// and return it
|
||||
|
||||
resolution := path.Base(dir)
|
||||
dir = path.Dir(dir)
|
||||
|
||||
files, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var maxTS int64 = 0
|
||||
|
||||
if len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if file.IsDir() {
|
||||
continue
|
||||
}
|
||||
name := file.Name()
|
||||
|
||||
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
|
||||
if err != nil {
|
||||
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
if ts > maxTS {
|
||||
maxTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
|
||||
|
||||
if startUp {
|
||||
return 0
|
||||
}
|
||||
|
||||
if updateTime < time.Now().Unix() {
|
||||
return 0
|
||||
}
|
||||
|
||||
return maxTS
|
||||
}
|
||||
|
||||
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
// fmt.Printf("Checkpointing directory: %s\n", dir)
|
||||
// filepath contains the resolution
|
||||
intRes, _ := strconv.Atoi(path.Base(dir))
|
||||
|
||||
// find smallest overall timestamp in l.data map and delete it from l.data
|
||||
minTS := int64(1<<63 - 1)
|
||||
for ts, dat := range l.data {
|
||||
if ts < minTS && len(dat) != 0 {
|
||||
minTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
if from == 0 && minTS != int64(1<<63-1) {
|
||||
from = minTS
|
||||
}
|
||||
|
||||
if from == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
var schema string
|
||||
var codec *goavro.Codec
|
||||
recordList := make([]map[string]any, 0)
|
||||
|
||||
var f *os.File
|
||||
|
||||
filePath := dir + fmt.Sprintf("_%d.avro", from)
|
||||
|
||||
var err error
|
||||
|
||||
fp_, err_ := os.Stat(filePath)
|
||||
if errors.Is(err_, os.ErrNotExist) {
|
||||
err = os.MkdirAll(path.Dir(dir), 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create directory: %v", err)
|
||||
}
|
||||
} else if fp_.Size() != 0 {
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open existing avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
reader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader: %v", err)
|
||||
}
|
||||
codec = reader.Codec()
|
||||
schema = codec.Schema()
|
||||
|
||||
f.Close()
|
||||
}
|
||||
|
||||
timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
|
||||
|
||||
if dumpAll {
|
||||
timeRef = time.Now().Unix()
|
||||
}
|
||||
|
||||
// Empty values
|
||||
if len(l.data) == 0 {
|
||||
// we checkpoint avro files every 60 seconds
|
||||
repeat := 60 / intRes
|
||||
|
||||
for range repeat {
|
||||
recordList = append(recordList, make(map[string]any))
|
||||
}
|
||||
}
|
||||
|
||||
readFlag := true
|
||||
|
||||
for ts := range l.data {
|
||||
flag := false
|
||||
if ts < timeRef {
|
||||
data := l.data[ts]
|
||||
|
||||
schemaGen, err := generateSchema(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
flag, schema, err = compareSchema(schema, schemaGen)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compare read and generated schema: %v", err)
|
||||
}
|
||||
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
|
||||
|
||||
f.Close()
|
||||
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open Avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
|
||||
}
|
||||
|
||||
for ocfReader.Scan() {
|
||||
record, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read record: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, record.(map[string]any))
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
err = os.Remove(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete file: %v", err)
|
||||
}
|
||||
|
||||
readFlag = false
|
||||
}
|
||||
codec, err = goavro.NewCodec(schema)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create codec after merged schema: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, generateRecord(data))
|
||||
delete(l.data, ts)
|
||||
}
|
||||
}
|
||||
|
||||
if len(recordList) == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to append new avro file: %v", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("Codec : %#v\n", codec)
|
||||
|
||||
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
|
||||
W: f,
|
||||
Codec: codec,
|
||||
CompressionName: goavro.CompressionDeflateLabel,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF writer: %v", err)
|
||||
}
|
||||
|
||||
// Append the new record
|
||||
if err := writer.Append(recordList); err != nil {
|
||||
return fmt.Errorf("failed to append record: %v", err)
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
|
||||
var genSchema, readSchema AvroSchema
|
||||
|
||||
if schemaRead == "" {
|
||||
return false, schemaGen, nil
|
||||
}
|
||||
|
||||
// Unmarshal the schema strings into AvroSchema structs
|
||||
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
|
||||
}
|
||||
|
||||
sort.Slice(genSchema.Fields, func(i, j int) bool {
|
||||
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
sort.Slice(readSchema.Fields, func(i, j int) bool {
|
||||
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual := true
|
||||
if len(genSchema.Fields) <= len(readSchema.Fields) {
|
||||
|
||||
for i := range genSchema.Fields {
|
||||
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// If schemas are identical, return the read schema
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Create a map to hold unique fields from both schemas
|
||||
fieldMap := make(map[string]AvroField)
|
||||
|
||||
// Add fields from the read schema
|
||||
for _, field := range readSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Add or update fields from the generated schema
|
||||
for _, field := range genSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Create a union schema by collecting fields from the map
|
||||
var mergedFields []AvroField
|
||||
for _, field := range fieldMap {
|
||||
mergedFields = append(mergedFields, field)
|
||||
}
|
||||
|
||||
// Sort fields by name for consistency
|
||||
sort.Slice(mergedFields, func(i, j int) bool {
|
||||
return mergedFields[i].Name < mergedFields[j].Name
|
||||
})
|
||||
|
||||
// Create the merged schema
|
||||
mergedSchema := AvroSchema{
|
||||
Type: "record",
|
||||
Name: genSchema.Name,
|
||||
Fields: mergedFields,
|
||||
}
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
|
||||
if schemasEqual {
|
||||
for i := range mergedSchema.Fields {
|
||||
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Marshal the merged schema back to JSON
|
||||
mergedSchemaJSON, err := json.Marshal(mergedSchema)
|
||||
if err != nil {
|
||||
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
|
||||
}
|
||||
|
||||
return true, string(mergedSchemaJSON), nil
|
||||
}
|
||||
|
||||
func generateSchema(data map[string]schema.Float) (string, error) {
|
||||
// Define the Avro schema structure
|
||||
schema := map[string]any{
|
||||
"type": "record",
|
||||
"name": "DataRecord",
|
||||
"fields": []map[string]any{},
|
||||
}
|
||||
|
||||
fieldTracker := make(map[string]struct{})
|
||||
|
||||
for key := range data {
|
||||
if _, exists := fieldTracker[key]; !exists {
|
||||
key = correctKey(key)
|
||||
|
||||
field := map[string]any{
|
||||
"name": key,
|
||||
"type": "double",
|
||||
"default": -1.0,
|
||||
}
|
||||
schema["fields"] = append(schema["fields"].([]map[string]any), field)
|
||||
fieldTracker[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
schemaString, err := json.Marshal(schema)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to marshal schema: %v", err)
|
||||
}
|
||||
|
||||
return string(schemaString), nil
|
||||
}
|
||||
|
||||
func generateRecord(data map[string]schema.Float) map[string]any {
|
||||
record := make(map[string]any)
|
||||
|
||||
// Iterate through each map in data
|
||||
for key, value := range data {
|
||||
key = correctKey(key)
|
||||
|
||||
// Set the value in the record
|
||||
// avro only accepts basic types
|
||||
record[key] = value.Double()
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func correctKey(key string) string {
|
||||
key = strings.ReplaceAll(key, "_", "_0x5F_")
|
||||
key = strings.ReplaceAll(key, ":", "_0x3A_")
|
||||
key = strings.ReplaceAll(key, ".", "_0x2E_")
|
||||
return key
|
||||
}
|
||||
|
||||
func ReplaceKey(key string) string {
|
||||
key = strings.ReplaceAll(key, "_0x2E_", ".")
|
||||
key = strings.ReplaceAll(key, "_0x3A_", ":")
|
||||
key = strings.ReplaceAll(key, "_0x5F_", "_")
|
||||
return key
|
||||
}
|
||||
84
internal/memorystore/avroHelper.go
Normal file
84
internal/memorystore/avroHelper.go
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"slices"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
// AvroPool is a pool of Avro writers.
|
||||
go func() {
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
wg.Done() // Mark this goroutine as done
|
||||
return // Exit the goroutine
|
||||
}
|
||||
|
||||
defer wg.Done()
|
||||
|
||||
var avroLevel *AvroLevel
|
||||
oldSelector := make([]string, 0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case val := <-LineProtocolMessages:
|
||||
// Fetch the frequency of the metric from the global configuration
|
||||
freq, err := GetMetricFrequency(val.MetricName)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error fetching metric frequency: %s\n", err)
|
||||
continue
|
||||
}
|
||||
|
||||
metricName := ""
|
||||
|
||||
for _, selectorName := range val.Selector {
|
||||
metricName += selectorName + SelectorDelimiter
|
||||
}
|
||||
|
||||
metricName += val.MetricName
|
||||
|
||||
// Create a new selector for the Avro level
|
||||
// The selector is a slice of strings that represents the path to the
|
||||
// Avro level. It is created by appending the cluster, node, and metric
|
||||
// name to the selector.
|
||||
var selector []string
|
||||
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
|
||||
|
||||
if !stringSlicesEqual(oldSelector, selector) {
|
||||
// Get the Avro level for the metric
|
||||
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
|
||||
|
||||
// If the Avro level is nil, create a new one
|
||||
if avroLevel == nil {
|
||||
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
|
||||
}
|
||||
oldSelector = slices.Clone(selector)
|
||||
}
|
||||
|
||||
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func stringSlicesEqual(a, b []string) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if a[i] != b[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
167
internal/memorystore/avroStruct.go
Normal file
167
internal/memorystore/avroStruct.go
Normal file
@@ -0,0 +1,167 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
LineProtocolMessages = make(chan *AvroStruct)
|
||||
// SelectorDelimiter separates hierarchical selector components in metric names for Avro encoding
|
||||
SelectorDelimiter = "_SEL_"
|
||||
)
|
||||
|
||||
var CheckpointBufferMinutes = DefaultCheckpointBufferMin
|
||||
|
||||
type AvroStruct struct {
|
||||
MetricName string
|
||||
Cluster string
|
||||
Node string
|
||||
Selector []string
|
||||
Value schema.Float
|
||||
Timestamp int64
|
||||
}
|
||||
|
||||
type AvroStore struct {
|
||||
root AvroLevel
|
||||
}
|
||||
|
||||
var avroStore AvroStore
|
||||
|
||||
type AvroLevel struct {
|
||||
children map[string]*AvroLevel
|
||||
data map[int64]map[string]schema.Float
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
type AvroField struct {
|
||||
Name string `json:"name"`
|
||||
Type any `json:"type"`
|
||||
Default any `json:"default,omitempty"`
|
||||
}
|
||||
|
||||
type AvroSchema struct {
|
||||
Type string `json:"type"`
|
||||
Name string `json:"name"`
|
||||
Fields []AvroField `json:"fields"`
|
||||
}
|
||||
|
||||
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *AvroLevel
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok := l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unique access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
child = &AvroLevel{
|
||||
data: make(map[int64]map[string]schema.Float, 0),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*AvroLevel{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
|
||||
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
|
||||
|
||||
// Create keys in advance for the given amount of time
|
||||
if len(l.data) != KeyCounter {
|
||||
if len(l.data) == 0 {
|
||||
for i := range KeyCounter {
|
||||
l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
} else {
|
||||
// Get the last timestamp
|
||||
var lastTS int64
|
||||
for ts := range l.data {
|
||||
if ts > lastTS {
|
||||
lastTS = ts
|
||||
}
|
||||
}
|
||||
// Create keys for the next KeyCounter timestamps
|
||||
l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
}
|
||||
|
||||
closestTS := int64(0)
|
||||
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
|
||||
found := false
|
||||
|
||||
// Iterate over timestamps and choose the one which is within range.
|
||||
// Since its epoch time, we check if the difference is less than 60 seconds.
|
||||
for ts, dat := range l.data {
|
||||
// Check if timestamp is within range
|
||||
diff := timestamp - ts
|
||||
if diff < -int64(Freq) || diff > int64(Freq) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Metric already present at this timestamp — skip
|
||||
if _, ok := dat[metricName]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if this is the closest timestamp so far
|
||||
if Abs(diff) < minDiff {
|
||||
minDiff = Abs(diff)
|
||||
closestTS = ts
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
l.data[closestTS][metricName] = value
|
||||
}
|
||||
}
|
||||
|
||||
func GetAvroStore() *AvroStore {
|
||||
return &avroStore
|
||||
}
|
||||
|
||||
// Abs returns the absolute value of x.
|
||||
func Abs(x int64) int64 {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
190
internal/memorystore/buffer.go
Normal file
190
internal/memorystore/buffer.go
Normal file
@@ -0,0 +1,190 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// BufferCap is the default buffer capacity.
|
||||
// buffer.data will only ever grow up to its capacity and a new link
|
||||
// in the buffer chain will be created if needed so that no copying
|
||||
// of data or reallocation needs to happen on writes.
|
||||
const BufferCap int = DefaultBufferCapacity
|
||||
|
||||
var bufferPool sync.Pool = sync.Pool{
|
||||
New: func() any {
|
||||
return &buffer{
|
||||
data: make([]schema.Float, 0, BufferCap),
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
var (
|
||||
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
|
||||
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
|
||||
)
|
||||
|
||||
// Each metric on each level has it's own buffer.
|
||||
// This is where the actual values go.
|
||||
// If `cap(data)` is reached, a new buffer is created and
|
||||
// becomes the new head of a buffer list.
|
||||
type buffer struct {
|
||||
prev *buffer
|
||||
next *buffer
|
||||
data []schema.Float
|
||||
frequency int64
|
||||
start int64
|
||||
archived bool
|
||||
closed bool
|
||||
}
|
||||
|
||||
func newBuffer(ts, freq int64) *buffer {
|
||||
b := bufferPool.Get().(*buffer)
|
||||
b.frequency = freq
|
||||
b.start = ts - (freq / 2)
|
||||
b.prev = nil
|
||||
b.next = nil
|
||||
b.archived = false
|
||||
b.closed = false
|
||||
b.data = b.data[:0]
|
||||
return b
|
||||
}
|
||||
|
||||
// If a new buffer was created, the new head is returnd.
|
||||
// Otherwise, the existing buffer is returnd.
|
||||
// Normaly, only "newer" data should be written, but if the value would
|
||||
// end up in the same buffer anyways it is allowed.
|
||||
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
|
||||
if ts < b.start {
|
||||
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
|
||||
}
|
||||
|
||||
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
|
||||
idx := int((ts - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
newbuf := newBuffer(ts, b.frequency)
|
||||
newbuf.prev = b
|
||||
b.next = newbuf
|
||||
b = newbuf
|
||||
idx = 0
|
||||
}
|
||||
|
||||
// Overwriting value or writing value from past
|
||||
if idx < len(b.data) {
|
||||
b.data[idx] = value
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// Fill up unwritten slots with NaN
|
||||
for i := len(b.data); i < idx; i++ {
|
||||
b.data = append(b.data, schema.NaN)
|
||||
}
|
||||
|
||||
b.data = append(b.data, value)
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (b *buffer) end() int64 {
|
||||
return b.firstWrite() + int64(len(b.data))*b.frequency
|
||||
}
|
||||
|
||||
func (b *buffer) firstWrite() int64 {
|
||||
return b.start + (b.frequency / 2)
|
||||
}
|
||||
|
||||
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
|
||||
// Simple linear interpolation is done between the two neighboring cells if possible.
|
||||
// If values at the start or end are missing, instead of NaN values, the second and thrid
|
||||
// return values contain the actual `from`/`to`.
|
||||
// This function goes back the buffer chain if `from` is older than the currents buffer start.
|
||||
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
|
||||
// If `data` is not long enough to hold all values, this function will panic!
|
||||
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
|
||||
if from < b.firstWrite() {
|
||||
if b.prev != nil {
|
||||
return b.prev.read(from, to, data)
|
||||
}
|
||||
from = b.firstWrite()
|
||||
}
|
||||
|
||||
i := 0
|
||||
t := from
|
||||
for ; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
if b.next == nil {
|
||||
break
|
||||
}
|
||||
b = b.next
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if idx >= len(b.data) {
|
||||
if b.next == nil || to <= b.next.start {
|
||||
break
|
||||
}
|
||||
data[i] += schema.NaN
|
||||
} else if t < b.start {
|
||||
data[i] += schema.NaN
|
||||
} else {
|
||||
data[i] += b.data[idx]
|
||||
}
|
||||
i++
|
||||
}
|
||||
|
||||
return data[:i], from, t, nil
|
||||
}
|
||||
|
||||
// Returns true if this buffer needs to be freed.
|
||||
func (b *buffer) free(t int64) (delme bool, n int) {
|
||||
if b.prev != nil {
|
||||
delme, m := b.prev.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
b.prev.next = nil
|
||||
if cap(b.prev.data) == BufferCap {
|
||||
bufferPool.Put(b.prev)
|
||||
}
|
||||
b.prev = nil
|
||||
}
|
||||
}
|
||||
|
||||
end := b.end()
|
||||
if end < t {
|
||||
return true, n + 1
|
||||
}
|
||||
|
||||
return false, n
|
||||
}
|
||||
|
||||
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
|
||||
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
|
||||
if b == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := b.prev.iterFromTo(from, to, callback); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if from <= b.end() && b.start <= to {
|
||||
return callback(b)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *buffer) count() int64 {
|
||||
res := int64(len(b.data))
|
||||
if b.prev != nil {
|
||||
res += b.prev.count()
|
||||
}
|
||||
return res
|
||||
}
|
||||
761
internal/memorystore/checkpoint.go
Normal file
761
internal/memorystore/checkpoint.go
Normal file
@@ -0,0 +1,761 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
CheckpointFilePerms = 0o644
|
||||
CheckpointDirPerms = 0o755
|
||||
GCTriggerInterval = DefaultGCTriggerInterval
|
||||
)
|
||||
|
||||
// Whenever changed, update MarshalJSON as well!
|
||||
type CheckpointMetrics struct {
|
||||
Data []schema.Float `json:"data"`
|
||||
Frequency int64 `json:"frequency"`
|
||||
Start int64 `json:"start"`
|
||||
}
|
||||
|
||||
type CheckpointFile struct {
|
||||
Metrics map[string]*CheckpointMetrics `json:"metrics"`
|
||||
Children map[string]*CheckpointFile `json:"children"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
}
|
||||
|
||||
var lastCheckpoint time.Time
|
||||
|
||||
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
lastCheckpoint = time.Now()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(d)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
|
||||
now := time.Now()
|
||||
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
|
||||
lastCheckpoint.Unix(), now.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
|
||||
lastCheckpoint = now
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(DefaultAvroCheckpointInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// As `Float` implements a custom MarshalJSON() function,
|
||||
// serializing an array of such types has more overhead
|
||||
// than one would assume (because of extra allocations, interfaces and so on).
|
||||
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
|
||||
buf := make([]byte, 0, 128+len(cm.Data)*8)
|
||||
buf = append(buf, `{"frequency":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Frequency, 10)
|
||||
buf = append(buf, `,"start":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Start, 10)
|
||||
buf = append(buf, `,"data":[`...)
|
||||
for i, x := range cm.Data {
|
||||
if i != 0 {
|
||||
buf = append(buf, ',')
|
||||
}
|
||||
if x.IsNaN() {
|
||||
buf = append(buf, `null`...)
|
||||
} else {
|
||||
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
|
||||
}
|
||||
}
|
||||
buf = append(buf, `]}`...)
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
|
||||
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
|
||||
// The good thing: Only a host at a time is locked, so this function can run
|
||||
// in parallel to writes/reads.
|
||||
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
|
||||
levels := make([]*Level, 0)
|
||||
selectors := make([][]string, 0)
|
||||
m.root.lock.RLock()
|
||||
for sel1, l1 := range m.root.children {
|
||||
l1.lock.RLock()
|
||||
for sel2, l2 := range l1.children {
|
||||
levels = append(levels, l2)
|
||||
selectors = append(selectors, []string{sel1, sel2})
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
m.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *Level
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(Keys.NumWorkers)
|
||||
work := make(chan workItem, Keys.NumWorkers*2)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := 0; i < len(levels); i++ {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
retval := &CheckpointFile{
|
||||
From: from,
|
||||
To: to,
|
||||
Metrics: make(map[string]*CheckpointMetrics),
|
||||
Children: make(map[string]*CheckpointFile),
|
||||
}
|
||||
|
||||
for metric, minfo := range m.Metrics {
|
||||
b := l.metrics[minfo.offset]
|
||||
if b == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
allArchived := true
|
||||
b.iterFromTo(from, to, func(b *buffer) error {
|
||||
if !b.archived {
|
||||
allArchived = false
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if allArchived {
|
||||
continue
|
||||
}
|
||||
|
||||
data := make([]schema.Float, (to-from)/b.frequency+1)
|
||||
data, start, end, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := int((end - start) / b.frequency); i < len(data); i++ {
|
||||
data[i] = schema.NaN
|
||||
}
|
||||
|
||||
retval.Metrics[metric] = &CheckpointMetrics{
|
||||
Frequency: b.frequency,
|
||||
Start: start,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
for name, child := range l.children {
|
||||
val, err := child.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if val != nil {
|
||||
retval.Children[name] = val
|
||||
}
|
||||
}
|
||||
|
||||
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return retval, nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
|
||||
cf, err := l.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf == nil {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
|
||||
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(dir, CheckpointDirPerms)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
bw := bufio.NewWriter(f)
|
||||
if err = json.NewEncoder(bw).Encode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return bw.Flush()
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
|
||||
var wg sync.WaitGroup
|
||||
work := make(chan [2]string, Keys.NumWorkers)
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
wg.Add(Keys.NumWorkers)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for host := range work {
|
||||
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
|
||||
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(nn))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
i := 0
|
||||
clustersDir, err := os.ReadDir(dir)
|
||||
for _, clusterDir := range clustersDir {
|
||||
if !clusterDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
goto done
|
||||
}
|
||||
|
||||
for _, hostDir := range hostsDir {
|
||||
if !hostDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
i++
|
||||
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
|
||||
// Forcing garbage collection runs here regulary during the loading of checkpoints
|
||||
// will decrease the total heap size after loading everything back to memory is done.
|
||||
// While loading data, the heap will grow fast, so the GC target size will double
|
||||
// almost always. By forcing GCs here, we can keep it growing more slowly so that
|
||||
// at the end, less memory is wasted.
|
||||
runtime.GC()
|
||||
}
|
||||
|
||||
work <- [2]string{clusterDir.Name(), hostDir.Name()}
|
||||
}
|
||||
}
|
||||
done:
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
|
||||
// This function can only be called once and before the very first write or read.
|
||||
// Different host's data is loaded to memory in parallel.
|
||||
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
|
||||
if _, err := os.Stat(dir); os.IsNotExist(err) {
|
||||
// The directory does not exist, so create it using os.MkdirAll()
|
||||
err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
|
||||
}
|
||||
cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
|
||||
}
|
||||
|
||||
// Config read (replace with your actual config read)
|
||||
fileFormat := Keys.Checkpoints.FileFormat
|
||||
if fileFormat == "" {
|
||||
fileFormat = "avro"
|
||||
}
|
||||
|
||||
// Map to easily get the fallback format
|
||||
oppositeFormat := map[string]string{
|
||||
"json": "avro",
|
||||
"avro": "json",
|
||||
}
|
||||
|
||||
// First, attempt to load the specified format
|
||||
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, fileFormat)
|
||||
}
|
||||
|
||||
// If not found, attempt the opposite format
|
||||
altFormat := oppositeFormat[fileFormat]
|
||||
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, altFormat)
|
||||
}
|
||||
|
||||
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func checkFilesWithExtension(dir string, extension string) (bool, error) {
|
||||
found := false
|
||||
|
||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
|
||||
}
|
||||
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
|
||||
found = true
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
|
||||
}
|
||||
|
||||
return found, nil
|
||||
}
|
||||
|
||||
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
|
||||
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
|
||||
}
|
||||
|
||||
fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
|
||||
|
||||
// Same logic according to lineprotocol
|
||||
fromTimestamp -= (resolution / 2)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
|
||||
|
||||
var recordCounter int64 = 0
|
||||
|
||||
// Create a new OCF reader from the buffered reader
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
|
||||
}
|
||||
|
||||
metricsData := make(map[string]schema.FloatArray)
|
||||
|
||||
for ocfReader.Scan() {
|
||||
datum, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
|
||||
}
|
||||
|
||||
record, ok := datum.(map[string]any)
|
||||
if !ok {
|
||||
return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
|
||||
}
|
||||
|
||||
for key, value := range record {
|
||||
metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64)))
|
||||
}
|
||||
|
||||
recordCounter += 1
|
||||
}
|
||||
|
||||
to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
|
||||
if to < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
for key, floatArray := range metricsData {
|
||||
metricName := ReplaceKey(key)
|
||||
|
||||
if strings.Contains(metricName, SelectorDelimiter) {
|
||||
subString := strings.Split(metricName, SelectorDelimiter)
|
||||
|
||||
lvl := l
|
||||
|
||||
for i := 0; i < len(subString)-1; i++ {
|
||||
|
||||
sel := subString[i]
|
||||
|
||||
if lvl.children == nil {
|
||||
lvl.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
child, ok := lvl.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
lvl.children[sel] = child
|
||||
}
|
||||
lvl = child
|
||||
}
|
||||
|
||||
leafMetricName := subString[len(subString)-1]
|
||||
err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
} else {
|
||||
err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
|
||||
n := len(floatArray)
|
||||
b := &buffer{
|
||||
frequency: resolution,
|
||||
start: from,
|
||||
data: floatArray[0:n:n],
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metricName]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
|
||||
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
|
||||
if missingCount > 0 {
|
||||
missingCount /= int(b.frequency)
|
||||
|
||||
for range missingCount {
|
||||
prev.data = append(prev.data, schema.NaN)
|
||||
}
|
||||
|
||||
prev.data = prev.data[0:len(prev.data):len(prev.data)]
|
||||
}
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
cf := &CheckpointFile{}
|
||||
if err := json.NewDecoder(br).Decode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf.To != 0 && cf.To < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := l.loadFile(cf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
|
||||
for name, metric := range cf.Metrics {
|
||||
n := len(metric.Data)
|
||||
b := &buffer{
|
||||
frequency: metric.Frequency,
|
||||
start: metric.Start,
|
||||
data: metric.Data[0:n:n],
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[name]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
}
|
||||
|
||||
if len(cf.Children) > 0 && l.children == nil {
|
||||
l.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
for sel, childCf := range cf.Children {
|
||||
child, ok := l.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
l.children[sel] = child
|
||||
}
|
||||
|
||||
if err := child.loadFile(childCf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
|
||||
direntries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return 0, err
|
||||
}
|
||||
|
||||
allFiles := make([]fs.DirEntry, 0)
|
||||
filesLoaded := 0
|
||||
for _, e := range direntries {
|
||||
if e.IsDir() {
|
||||
child := &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: make(map[string]*Level),
|
||||
}
|
||||
|
||||
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
|
||||
filesLoaded += files
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
l.children[e.Name()] = child
|
||||
} else if strings.HasSuffix(e.Name(), "."+extension) {
|
||||
allFiles = append(allFiles, e)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
files, err := findFiles(allFiles, from, extension, true)
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
|
||||
"json": l.loadJSONFile,
|
||||
"avro": l.loadAvroFile,
|
||||
}
|
||||
|
||||
loader := loaders[extension]
|
||||
|
||||
for _, filename := range files {
|
||||
// Use a closure to ensure file is closed immediately after use
|
||||
err := func() error {
|
||||
f, err := os.Open(path.Join(dir, filename))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return loader(m, f, from)
|
||||
}()
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
filesLoaded += 1
|
||||
}
|
||||
|
||||
return filesLoaded, nil
|
||||
}
|
||||
|
||||
// This will probably get very slow over time!
|
||||
// A solution could be some sort of an index file in which all other files
|
||||
// and the timespan they contain is listed.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
|
||||
nums := map[string]int64{}
|
||||
for _, e := range direntries {
|
||||
if !strings.HasSuffix(e.Name(), "."+extension) {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nums[e.Name()] = ts
|
||||
}
|
||||
|
||||
sort.Slice(direntries, func(i, j int) bool {
|
||||
a, b := direntries[i], direntries[j]
|
||||
return nums[a.Name()] < nums[b.Name()]
|
||||
})
|
||||
|
||||
filenames := make([]string, 0)
|
||||
for i := range direntries {
|
||||
e := direntries[i]
|
||||
ts1 := nums[e.Name()]
|
||||
|
||||
if findMoreRecentFiles && t <= ts1 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
if i == len(direntries)-1 {
|
||||
continue
|
||||
}
|
||||
|
||||
enext := direntries[i+1]
|
||||
ts2 := nums[enext.Name()]
|
||||
|
||||
if findMoreRecentFiles {
|
||||
if ts1 < t && t < ts2 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
} else {
|
||||
if ts2 < t {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return filenames, nil
|
||||
}
|
||||
115
internal/memorystore/config.go
Normal file
115
internal/memorystore/config.go
Normal file
@@ -0,0 +1,115 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
DefaultMaxWorkers = 10
|
||||
DefaultBufferCapacity = 512
|
||||
DefaultGCTriggerInterval = 100
|
||||
DefaultAvroWorkers = 4
|
||||
DefaultCheckpointBufferMin = 3
|
||||
DefaultAvroCheckpointInterval = time.Minute
|
||||
)
|
||||
|
||||
type MetricStoreConfig struct {
|
||||
// Number of concurrent workers for checkpoint and archive operations.
|
||||
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
|
||||
NumWorkers int `json:"num-workers"`
|
||||
Checkpoints struct {
|
||||
FileFormat string `json:"file-format"`
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
Restore string `json:"restore"`
|
||||
} `json:"checkpoints"`
|
||||
Debug struct {
|
||||
DumpToFile string `json:"dump-to-file"`
|
||||
EnableGops bool `json:"gops"`
|
||||
} `json:"debug"`
|
||||
RetentionInMemory string `json:"retention-in-memory"`
|
||||
Archive struct {
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
DeleteInstead bool `json:"delete-instead"`
|
||||
} `json:"archive"`
|
||||
Subscriptions []struct {
|
||||
// Channel name
|
||||
SubscribeTo string `json:"subscribe-to"`
|
||||
|
||||
// Allow lines without a cluster tag, use this as default, optional
|
||||
ClusterTag string `json:"cluster-tag"`
|
||||
} `json:"subscriptions"`
|
||||
}
|
||||
|
||||
var Keys MetricStoreConfig
|
||||
|
||||
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
|
||||
type AggregationStrategy int
|
||||
|
||||
const (
|
||||
NoAggregation AggregationStrategy = iota
|
||||
SumAggregation
|
||||
AvgAggregation
|
||||
)
|
||||
|
||||
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
|
||||
switch str {
|
||||
case "":
|
||||
return NoAggregation, nil
|
||||
case "sum":
|
||||
return SumAggregation, nil
|
||||
case "avg":
|
||||
return AvgAggregation, nil
|
||||
default:
|
||||
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
|
||||
}
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
// Interval in seconds at which measurements are stored
|
||||
Frequency int64
|
||||
|
||||
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
|
||||
Aggregation AggregationStrategy
|
||||
|
||||
// Private, used internally...
|
||||
offset int
|
||||
}
|
||||
|
||||
var Metrics map[string]MetricConfig
|
||||
|
||||
func GetMetricFrequency(metricName string) (int64, error) {
|
||||
if metric, ok := Metrics[metricName]; ok {
|
||||
return metric.Frequency, nil
|
||||
}
|
||||
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
|
||||
}
|
||||
|
||||
// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency.
|
||||
// use metric.Name to check if the metric already exists.
|
||||
// if not, add it to the Metrics map.
|
||||
func AddMetric(name string, metric MetricConfig) error {
|
||||
if Metrics == nil {
|
||||
Metrics = make(map[string]MetricConfig, 0)
|
||||
}
|
||||
|
||||
if existingMetric, ok := Metrics[name]; ok {
|
||||
if existingMetric.Frequency != metric.Frequency {
|
||||
if existingMetric.Frequency < metric.Frequency {
|
||||
existingMetric.Frequency = metric.Frequency
|
||||
Metrics[name] = existingMetric
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Metrics[name] = metric
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
95
internal/memorystore/configSchema.go
Normal file
95
internal/memorystore/configSchema.go
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
const configSchema = `{
|
||||
"type": "object",
|
||||
"description": "Configuration specific to built-in metric-store.",
|
||||
"properties": {
|
||||
"checkpoints": {
|
||||
"description": "Configuration for checkpointing the metrics within metric-store",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file-format": {
|
||||
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
|
||||
"type": "string"
|
||||
},
|
||||
"interval": {
|
||||
"description": "Interval at which the metrics should be checkpointed.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the checkpointed files should be placed.",
|
||||
"type": "string"
|
||||
},
|
||||
"restore": {
|
||||
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"archive": {
|
||||
"description": "Configuration for archiving the already checkpointed files.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"interval": {
|
||||
"description": "Interval at which the checkpointed files should be archived.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the archived files should be placed.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"retention-in-memory": {
|
||||
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
|
||||
"type": "string"
|
||||
},
|
||||
"nats": {
|
||||
"description": "Configuration for accepting published data through NATS.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"address": {
|
||||
"description": "Address of the NATS server.",
|
||||
"type": "string"
|
||||
},
|
||||
"username": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"password": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"creds-file-path": {
|
||||
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
|
||||
"type": "string"
|
||||
},
|
||||
"subscriptions": {
|
||||
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"subscribe-to": {
|
||||
"description": "Channel name",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster-tag": {
|
||||
"description": "Optional: Allow lines without a cluster tag, use this as default",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
112
internal/memorystore/debug.go
Normal file
112
internal/memorystore/debug.go
Normal file
@@ -0,0 +1,112 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func (b *buffer) debugDump(buf []byte) []byte {
|
||||
if b.prev != nil {
|
||||
buf = b.prev.debugDump(buf)
|
||||
}
|
||||
|
||||
start, len, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data))
|
||||
buf = append(buf, `{"start":`...)
|
||||
buf = strconv.AppendInt(buf, start, 10)
|
||||
buf = append(buf, `,"len":`...)
|
||||
buf = strconv.AppendInt(buf, int64(len), 10)
|
||||
buf = append(buf, `,"end":`...)
|
||||
buf = strconv.AppendInt(buf, end, 10)
|
||||
if b.archived {
|
||||
buf = append(buf, `,"saved":true`...)
|
||||
}
|
||||
if b.next != nil {
|
||||
buf = append(buf, `},`...)
|
||||
} else {
|
||||
buf = append(buf, `}`...)
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, lvlname...)
|
||||
buf = append(buf, "\":{\n"...)
|
||||
depth += 1
|
||||
objitems := 0
|
||||
for name, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, name...)
|
||||
buf = append(buf, `":[`...)
|
||||
buf = b.debugDump(buf)
|
||||
buf = append(buf, "],\n"...)
|
||||
objitems++
|
||||
}
|
||||
}
|
||||
|
||||
for name, lvl := range l.children {
|
||||
_, err := w.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = buf[0:0]
|
||||
buf, err = lvl.debugDump(m, w, name, buf, depth)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = append(buf, ',', '\n')
|
||||
objitems++
|
||||
}
|
||||
|
||||
// remove final `,`:
|
||||
if objitems > 0 {
|
||||
buf = append(buf[0:len(buf)-1], '\n')
|
||||
}
|
||||
|
||||
depth -= 1
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '}')
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 2048)
|
||||
buf = append(buf, "{"...)
|
||||
|
||||
buf, err := lvl.debugDump(m, w, "data", buf, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
buf = append(buf, "}\n"...)
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
92
internal/memorystore/healthcheck.go
Normal file
92
internal/memorystore/healthcheck.go
Normal file
@@ -0,0 +1,92 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
|
||||
// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
|
||||
// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
|
||||
const MaxMissingDataPoints int64 = 5
|
||||
|
||||
// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
|
||||
// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
|
||||
// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
|
||||
// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
|
||||
const MaxUnhealthyMetrics int64 = 5
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
}
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
count += c
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
192
internal/memorystore/level.go
Normal file
192
internal/memorystore/level.go
Normal file
@@ -0,0 +1,192 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
// Could also be called "node" as this forms a node in a tree structure.
|
||||
// Called Level because "node" might be confusing here.
|
||||
// Can be both a leaf or a inner node. In this tree structue, inner nodes can
|
||||
// also hold data (in `metrics`).
|
||||
type Level struct {
|
||||
children map[string]*Level
|
||||
metrics []*buffer
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
// Find the correct level for the given selector, creating it if
|
||||
// it does not exist. Example selector in the context of the
|
||||
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
|
||||
// This function would probably benefit a lot from `level.children` beeing a `sync.Map`?
|
||||
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *Level
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok = l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unique access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, nMetrics),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*Level{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
|
||||
func (l *Level) free(t int64) (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
n := 0
|
||||
for i, b := range l.metrics {
|
||||
if b != nil {
|
||||
delme, m := b.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
if cap(b.data) == BufferCap {
|
||||
bufferPool.Put(b)
|
||||
}
|
||||
l.metrics[i] = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range l.children {
|
||||
m, err := l.free(t)
|
||||
n += m
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (l *Level) sizeInBytes() int64 {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
size := int64(0)
|
||||
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
|
||||
}
|
||||
}
|
||||
|
||||
for _, child := range l.children {
|
||||
size += child.sizeInBytes()
|
||||
}
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
func (l *Level) findLevel(selector []string) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
lvl := l.children[selector[0]]
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return lvl.findLevel(selector[1:])
|
||||
}
|
||||
|
||||
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
if len(selector) == 0 {
|
||||
b := l.metrics[offset]
|
||||
if b != nil {
|
||||
return f(b)
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
err := lvl.findBuffers(nil, offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
sel := selector[0]
|
||||
if len(sel.String) != 0 && l.children != nil {
|
||||
lvl, ok := l.children[sel.String]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Group != nil && l.children != nil {
|
||||
for _, key := range sel.Group {
|
||||
lvl, ok := l.children[key]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Any && l.children != nil {
|
||||
for _, lvl := range l.children {
|
||||
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
258
internal/memorystore/lineprotocol.go
Normal file
258
internal/memorystore/lineprotocol.go
Normal file
@@ -0,0 +1,258 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/nats"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
func ReceiveNats(ms *MemoryStore,
|
||||
workers int,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
nc := nats.GetClient()
|
||||
|
||||
if nc == nil {
|
||||
cclog.Warn("NATS client not initialized")
|
||||
return nil
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
msgs := make(chan []byte, workers*2)
|
||||
|
||||
for _, sc := range Keys.Subscriptions {
|
||||
clusterTag := sc.ClusterTag
|
||||
if workers > 1 {
|
||||
wg.Add(workers)
|
||||
|
||||
for range workers {
|
||||
go func() {
|
||||
for m := range msgs {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
|
||||
msgs <- data
|
||||
})
|
||||
} else {
|
||||
nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) {
|
||||
dec := lineprotocol.NewDecoderWithBytes(data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
cclog.Infof("NATS subscription to '%s' established", sc.SubscribeTo)
|
||||
}
|
||||
|
||||
close(msgs)
|
||||
wg.Wait()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Place `prefix` in front of `buf` but if possible,
|
||||
// do that inplace in `buf`.
|
||||
func reorder(buf, prefix []byte) []byte {
|
||||
n := len(prefix)
|
||||
m := len(buf)
|
||||
if cap(buf) < m+n {
|
||||
return append(prefix[:n:n], buf...)
|
||||
} else {
|
||||
buf = buf[:n+m]
|
||||
for i := m - 1; i >= 0; i-- {
|
||||
buf[i+n] = buf[i]
|
||||
}
|
||||
for i := range n {
|
||||
buf[i] = prefix[i]
|
||||
}
|
||||
return buf
|
||||
}
|
||||
}
|
||||
|
||||
// Decode lines using dec and make write calls to the MemoryStore.
|
||||
// If a line is missing its cluster tag, use clusterDefault as default.
|
||||
func DecodeLine(dec *lineprotocol.Decoder,
|
||||
ms *MemoryStore,
|
||||
clusterDefault string,
|
||||
) error {
|
||||
// Reduce allocations in loop:
|
||||
t := time.Now()
|
||||
metric, metricBuf := Metric{}, make([]byte, 0, 16)
|
||||
selector := make([]string, 0, 4)
|
||||
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
|
||||
|
||||
// Optimize for the case where all lines in a "batch" are about the same
|
||||
// cluster and host. By using `WriteToLevel` (level = host), we do not need
|
||||
// to take the root- and cluster-level lock as often.
|
||||
var lvl *Level = nil
|
||||
prevCluster, prevHost := "", ""
|
||||
|
||||
var ok bool
|
||||
for dec.Next() {
|
||||
rawmeasurement, err := dec.Measurement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Needs to be copied because another call to dec.* would
|
||||
// invalidate the returned slice.
|
||||
metricBuf = append(metricBuf[:0], rawmeasurement...)
|
||||
|
||||
// The go compiler optimizes map[string(byteslice)] lookups:
|
||||
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
|
||||
cluster, host := clusterDefault, ""
|
||||
for {
|
||||
key, val, err := dec.NextTag()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
// The go compiler optimizes string([]byte{...}) == "...":
|
||||
switch string(key) {
|
||||
case "cluster":
|
||||
if string(val) == prevCluster {
|
||||
cluster = prevCluster
|
||||
} else {
|
||||
cluster = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "hostname", "host":
|
||||
if string(val) == prevHost {
|
||||
host = prevHost
|
||||
} else {
|
||||
host = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "type":
|
||||
if string(val) == "node" {
|
||||
break
|
||||
}
|
||||
|
||||
// We cannot be sure that the "type" tag comes before the "type-id" tag:
|
||||
if len(typeBuf) == 0 {
|
||||
typeBuf = append(typeBuf, val...)
|
||||
} else {
|
||||
typeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "type-id":
|
||||
typeBuf = append(typeBuf, val...)
|
||||
case "subtype":
|
||||
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
|
||||
if len(subTypeBuf) == 0 {
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
} else {
|
||||
subTypeBuf = reorder(subTypeBuf, val)
|
||||
// subTypeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "stype-id":
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// If the cluster or host changed, the lvl was set to nil
|
||||
if lvl == nil {
|
||||
selector = selector[:2]
|
||||
selector[0], selector[1] = cluster, host
|
||||
lvl = ms.GetLevel(selector)
|
||||
prevCluster, prevHost = cluster, host
|
||||
}
|
||||
|
||||
// subtypes:
|
||||
selector = selector[:0]
|
||||
if len(typeBuf) > 0 {
|
||||
selector = append(selector, string(typeBuf)) // <- Allocation :(
|
||||
if len(subTypeBuf) > 0 {
|
||||
selector = append(selector, string(subTypeBuf))
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
key, val, err := dec.NextField()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if string(key) != "value" {
|
||||
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
|
||||
}
|
||||
|
||||
if val.Kind() == lineprotocol.Float {
|
||||
metric.Value = schema.Float(val.FloatV())
|
||||
} else if val.Kind() == lineprotocol.Int {
|
||||
metric.Value = schema.Float(val.IntV())
|
||||
} else if val.Kind() == lineprotocol.Uint {
|
||||
metric.Value = schema.Float(val.UintV())
|
||||
} else {
|
||||
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
|
||||
}
|
||||
}
|
||||
|
||||
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
|
||||
time := t.Unix()
|
||||
|
||||
if Keys.Checkpoints.FileFormat != "json" {
|
||||
LineProtocolMessages <- &AvroStruct{
|
||||
MetricName: string(metricBuf),
|
||||
Cluster: cluster,
|
||||
Node: host,
|
||||
Selector: append([]string{}, selector...),
|
||||
Value: metric.Value,
|
||||
Timestamp: time,
|
||||
}
|
||||
}
|
||||
|
||||
if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
429
internal/memorystore/memorystore.go
Normal file
429
internal/memorystore/memorystore.go
Normal file
@@ -0,0 +1,429 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package memorystore provides an efficient in-memory time-series metric storage system
|
||||
// with support for hierarchical data organization, checkpointing, and archiving.
|
||||
//
|
||||
// The package organizes metrics in a tree structure (cluster → host → component) and
|
||||
// provides concurrent read/write access to metric data with configurable aggregation strategies.
|
||||
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
|
||||
// and enforcing retention policies.
|
||||
//
|
||||
// Key features:
|
||||
// - In-memory metric storage with configurable retention
|
||||
// - Hierarchical data organization (selectors)
|
||||
// - Concurrent checkpoint/archive workers
|
||||
// - Support for sum and average aggregation
|
||||
// - NATS integration for metric ingestion
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
var (
|
||||
singleton sync.Once
|
||||
msInstance *MemoryStore
|
||||
// shutdownFunc stores the context cancellation function created in Init
|
||||
// and is called during Shutdown to cancel all background goroutines
|
||||
shutdownFunc context.CancelFunc
|
||||
)
|
||||
|
||||
type Metric struct {
|
||||
Name string
|
||||
Value schema.Float
|
||||
MetricConfig MetricConfig
|
||||
}
|
||||
|
||||
type MemoryStore struct {
|
||||
Metrics map[string]MetricConfig
|
||||
root Level
|
||||
}
|
||||
|
||||
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
|
||||
startupTime := time.Now()
|
||||
|
||||
if rawConfig != nil {
|
||||
config.Validate(configSchema, rawConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(rawConfig))
|
||||
// dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Set NumWorkers from config or use default
|
||||
if Keys.NumWorkers <= 0 {
|
||||
Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers)
|
||||
}
|
||||
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
|
||||
|
||||
// Helper function to add metric configuration
|
||||
addMetricConfig := func(mc schema.MetricConfig) {
|
||||
agg, err := AssignAggregationStrategy(mc.Aggregation)
|
||||
if err != nil {
|
||||
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
|
||||
}
|
||||
|
||||
AddMetric(mc.Name, MetricConfig{
|
||||
Frequency: int64(mc.Timestep),
|
||||
Aggregation: agg,
|
||||
})
|
||||
}
|
||||
|
||||
for _, c := range archive.Clusters {
|
||||
for _, mc := range c.MetricConfig {
|
||||
addMetricConfig(*mc)
|
||||
}
|
||||
|
||||
for _, sc := range c.SubClusters {
|
||||
for _, mc := range sc.MetricConfig {
|
||||
addMetricConfig(mc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the config.MetricStoreKeys
|
||||
InitMetrics(Metrics)
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
restoreFrom := startupTime.Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
|
||||
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
|
||||
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
|
||||
}
|
||||
|
||||
// Try to use less memory by forcing a GC run here and then
|
||||
// lowering the target percentage. The default of 100 means
|
||||
// that only once the ratio of new allocations execeds the
|
||||
// previously active heap, a GC is triggered.
|
||||
// Forcing a GC here will set the "previously active heap"
|
||||
// to a minumum.
|
||||
runtime.GC()
|
||||
|
||||
ctx, shutdown := context.WithCancel(context.Background())
|
||||
|
||||
wg.Add(4)
|
||||
|
||||
Retention(wg, ctx)
|
||||
Checkpointing(wg, ctx)
|
||||
Archiving(wg, ctx)
|
||||
DataStaging(wg, ctx)
|
||||
|
||||
// Note: Signal handling has been removed from this function.
|
||||
// The caller is responsible for handling shutdown signals and calling
|
||||
// the shutdown() function when appropriate.
|
||||
// Store the shutdown function for later use by Shutdown()
|
||||
shutdownFunc = shutdown
|
||||
|
||||
err = ReceiveNats(ms, 1, ctx)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// InitMetrics creates a new, initialized instance of a MemoryStore.
|
||||
// Will panic if values in the metric configurations are invalid.
|
||||
func InitMetrics(metrics map[string]MetricConfig) {
|
||||
singleton.Do(func() {
|
||||
offset := 0
|
||||
for key, cfg := range metrics {
|
||||
if cfg.Frequency == 0 {
|
||||
panic("[METRICSTORE]> invalid frequency")
|
||||
}
|
||||
|
||||
metrics[key] = MetricConfig{
|
||||
Frequency: cfg.Frequency,
|
||||
Aggregation: cfg.Aggregation,
|
||||
offset: offset,
|
||||
}
|
||||
offset += 1
|
||||
}
|
||||
|
||||
msInstance = &MemoryStore{
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
Metrics: metrics,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetMemoryStore() *MemoryStore {
|
||||
if msInstance == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return msInstance
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
// Check if memorystore was initialized
|
||||
if msInstance == nil {
|
||||
cclog.Debug("[METRICSTORE]> MemoryStore not initialized, skipping shutdown")
|
||||
return
|
||||
}
|
||||
|
||||
// Cancel the context to signal all background goroutines to stop
|
||||
if shutdownFunc != nil {
|
||||
shutdownFunc()
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
|
||||
var files int
|
||||
var err error
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
|
||||
} else {
|
||||
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
|
||||
close(LineProtocolMessages)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
|
||||
}
|
||||
|
||||
func getName(m *MemoryStore, i int) string {
|
||||
for key, val := range m.Metrics {
|
||||
if val.offset == i {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func Retention(wg *sync.WaitGroup, ctx context.Context) {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.RetentionInMemory)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
tickInterval := d / 2
|
||||
if tickInterval <= 0 {
|
||||
return
|
||||
}
|
||||
ticker := time.NewTicker(tickInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
|
||||
freed, err := ms.Free(nil, t.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
|
||||
// Look at `findLevelOrCreate` for how selectors work.
|
||||
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
|
||||
var ok bool
|
||||
for i, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
metric.MetricConfig, ok = m.Metrics[metric.Name]
|
||||
if !ok {
|
||||
metric.MetricConfig.Frequency = 0
|
||||
}
|
||||
metrics[i] = metric
|
||||
}
|
||||
}
|
||||
|
||||
return m.WriteToLevel(&m.root, selector, ts, metrics)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) GetLevel(selector []string) *Level {
|
||||
return m.root.findLevelOrCreate(selector, len(m.Metrics))
|
||||
}
|
||||
|
||||
// WriteToLevel assumes that `minfo` in `metrics` is filled in
|
||||
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
|
||||
l = l.findLevelOrCreate(selector, len(m.Metrics))
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
for _, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
b := l.metrics[metric.MetricConfig.offset]
|
||||
if b == nil {
|
||||
// First write to this metric and level
|
||||
b = newBuffer(ts, metric.MetricConfig.Frequency)
|
||||
l.metrics[metric.MetricConfig.offset] = b
|
||||
}
|
||||
|
||||
nb, err := b.write(ts, metric.Value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Last write created a new buffer...
|
||||
if b != nb {
|
||||
l.metrics[metric.MetricConfig.offset] = nb
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
|
||||
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
|
||||
// The second and third return value are the actual from/to for the data. Those can be different from
|
||||
// the range asked for if no data was available.
|
||||
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
|
||||
}
|
||||
|
||||
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
|
||||
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
cdata, cfrom, cto, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto || len(data) != len(cdata) {
|
||||
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
|
||||
if missingfront != 0 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
newlen := len(cdata) - missingback
|
||||
if newlen < 1 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
cdata = cdata[0:newlen]
|
||||
if len(cdata) != len(data) {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
from, to = cfrom, cto
|
||||
}
|
||||
|
||||
data = cdata
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
} else if n == 0 {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
|
||||
} else if n > 1 {
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
normalize := 1. / schema.Float(n)
|
||||
for i := 0; i < len(data); i++ {
|
||||
data[i] *= normalize
|
||||
}
|
||||
} else if minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
|
||||
}
|
||||
}
|
||||
|
||||
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
}
|
||||
|
||||
return data, from, to, resolution, nil
|
||||
}
|
||||
|
||||
// Free releases all buffers for the selected level and all its children that
|
||||
// contain only values older than `t`.
|
||||
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
|
||||
return m.GetLevel(selector).free(t)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FreeAll() error {
|
||||
for k := range m.root.children {
|
||||
delete(m.root.children, k)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) SizeInBytes() int64 {
|
||||
return m.root.sizeInBytes()
|
||||
}
|
||||
|
||||
// ListChildren , given a selector, returns a list of all children of the level
|
||||
// selected.
|
||||
func (m *MemoryStore) ListChildren(selector []string) []string {
|
||||
lvl := &m.root
|
||||
for lvl != nil && len(selector) != 0 {
|
||||
lvl.lock.RLock()
|
||||
next := lvl.children[selector[0]]
|
||||
lvl.lock.RUnlock()
|
||||
lvl = next
|
||||
selector = selector[1:]
|
||||
}
|
||||
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
lvl.lock.RLock()
|
||||
defer lvl.lock.RUnlock()
|
||||
|
||||
children := make([]string, 0, len(lvl.children))
|
||||
for child := range lvl.children {
|
||||
children = append(children, child)
|
||||
}
|
||||
|
||||
return children
|
||||
}
|
||||
156
internal/memorystore/memorystore_test.go
Normal file
156
internal/memorystore/memorystore_test.go
Normal file
@@ -0,0 +1,156 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
func TestAssignAggregationStrategy(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected AggregationStrategy
|
||||
wantErr bool
|
||||
}{
|
||||
{"empty string", "", NoAggregation, false},
|
||||
{"sum", "sum", SumAggregation, false},
|
||||
{"avg", "avg", AvgAggregation, false},
|
||||
{"invalid", "invalid", NoAggregation, true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := AssignAggregationStrategy(tt.input)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if result != tt.expected {
|
||||
t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddMetric(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = make(map[string]MetricConfig)
|
||||
|
||||
err := AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if _, ok := Metrics["test_metric"]; !ok {
|
||||
t.Error("AddMetric() did not add metric to Metrics map")
|
||||
}
|
||||
|
||||
// Test updating with higher frequency
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 120,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
|
||||
// Test updating with lower frequency (should not update)
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 30,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricFrequency(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = map[string]MetricConfig{
|
||||
"test_metric": {
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
},
|
||||
}
|
||||
|
||||
freq, err := GetMetricFrequency("test_metric")
|
||||
if err != nil {
|
||||
t.Errorf("GetMetricFrequency() error = %v", err)
|
||||
}
|
||||
if freq != 60 {
|
||||
t.Errorf("GetMetricFrequency() = %d, want 60", freq)
|
||||
}
|
||||
|
||||
_, err = GetMetricFrequency("nonexistent")
|
||||
if err == nil {
|
||||
t.Error("GetMetricFrequency() expected error for nonexistent metric")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferWrite(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Test writing value
|
||||
nb, err := b.write(100, schema.Float(42.0))
|
||||
if err != nil {
|
||||
t.Errorf("buffer.write() error = %v", err)
|
||||
}
|
||||
if nb != b {
|
||||
t.Error("buffer.write() created new buffer unexpectedly")
|
||||
}
|
||||
if len(b.data) != 1 {
|
||||
t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data))
|
||||
}
|
||||
if b.data[0] != schema.Float(42.0) {
|
||||
t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0])
|
||||
}
|
||||
|
||||
// Test writing value from past (should error)
|
||||
_, err = b.write(50, schema.Float(10.0))
|
||||
if err == nil {
|
||||
t.Error("buffer.write() expected error for past timestamp")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferRead(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Write some test data
|
||||
b.write(100, schema.Float(1.0))
|
||||
b.write(110, schema.Float(2.0))
|
||||
b.write(120, schema.Float(3.0))
|
||||
|
||||
// Read data
|
||||
data := make([]schema.Float, 3)
|
||||
result, from, to, err := b.read(100, 130, data)
|
||||
if err != nil {
|
||||
t.Errorf("buffer.read() error = %v", err)
|
||||
}
|
||||
// Buffer read should return from as firstWrite (start + freq/2)
|
||||
if from != 100 {
|
||||
t.Errorf("buffer.read() from = %d, want 100", from)
|
||||
}
|
||||
if to != 130 {
|
||||
t.Errorf("buffer.read() to = %d, want 130", to)
|
||||
}
|
||||
if len(result) != 3 {
|
||||
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
|
||||
}
|
||||
}
|
||||
1080
internal/memorystore/query.go
Normal file
1080
internal/memorystore/query.go
Normal file
File diff suppressed because it is too large
Load Diff
124
internal/memorystore/stats.go
Normal file
124
internal/memorystore/stats.go
Normal file
@@ -0,0 +1,124 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
type Stats struct {
|
||||
Samples int
|
||||
Avg util.Float
|
||||
Min util.Float
|
||||
Max util.Float
|
||||
}
|
||||
|
||||
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
|
||||
if from < b.start {
|
||||
if b.prev != nil {
|
||||
return b.prev.stats(from, to)
|
||||
}
|
||||
from = b.start
|
||||
}
|
||||
|
||||
// TODO: Check if b.closed and if so and the full buffer is queried,
|
||||
// use b.statistics instead of iterating over the buffer.
|
||||
|
||||
samples := 0
|
||||
sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
|
||||
var t int64
|
||||
for t = from; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
b = b.next
|
||||
if b == nil {
|
||||
break
|
||||
}
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if t < b.start || idx >= len(b.data) {
|
||||
continue
|
||||
}
|
||||
|
||||
xf := float64(b.data[idx])
|
||||
if math.IsNaN(xf) {
|
||||
continue
|
||||
}
|
||||
|
||||
samples += 1
|
||||
sum += xf
|
||||
min = math.Min(min, xf)
|
||||
max = math.Max(max, xf)
|
||||
}
|
||||
|
||||
return Stats{
|
||||
Samples: samples,
|
||||
Avg: util.Float(sum) / util.Float(samples),
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, t, nil
|
||||
}
|
||||
|
||||
// Returns statistics for the requested metric on the selected node/level.
|
||||
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
|
||||
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
|
||||
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, errors.New("invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, errors.New("unknown metric: " + metric)
|
||||
}
|
||||
|
||||
n, samples := 0, 0
|
||||
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
stats, cfrom, cto, err := b.stats(from, to)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
samples += stats.Samples
|
||||
avg += stats.Avg
|
||||
min = math.Min(min, float64(stats.Min))
|
||||
max = math.Max(max, float64(stats.Max))
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
return nil, 0, 0, ErrNoData
|
||||
}
|
||||
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
avg /= util.Float(n)
|
||||
} else if n > 1 && minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, errors.New("invalid aggregation")
|
||||
}
|
||||
|
||||
return &Stats{
|
||||
Samples: samples,
|
||||
Avg: avg,
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, to, nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,313 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
||||
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
||||
)
|
||||
|
||||
type InfluxDBv2DataRepositoryConfig struct {
|
||||
Url string `json:"url"`
|
||||
Token string `json:"token"`
|
||||
Bucket string `json:"bucket"`
|
||||
Org string `json:"org"`
|
||||
SkipTls bool `json:"skiptls"`
|
||||
}
|
||||
|
||||
type InfluxDBv2DataRepository struct {
|
||||
client influxdb2.Client
|
||||
queryClient influxdb2Api.QueryAPI
|
||||
bucket, measurement string
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config InfluxDBv2DataRepositoryConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json config")
|
||||
return err
|
||||
}
|
||||
|
||||
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
|
||||
idb.queryClient = idb.client.QueryAPI(config.Org)
|
||||
idb.bucket = config.Bucket
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
|
||||
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
|
||||
return time.Unix(epoch, 0)
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
measurementsConds := make([]string, 0, len(metrics))
|
||||
for _, m := range metrics {
|
||||
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
|
||||
}
|
||||
measurementsCond := strings.Join(measurementsConds, " or ")
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
|
||||
// Requested Scopes
|
||||
for _, scope := range scopes {
|
||||
query := ""
|
||||
switch scope {
|
||||
case "node":
|
||||
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
|
||||
// log.Info("Scope 'node' requested. ")
|
||||
query = fmt.Sprintf(`
|
||||
from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => (%s) and (%s) )
|
||||
|> drop(columns: ["_start", "_stop"])
|
||||
|> group(columns: ["hostname", "_measurement"])
|
||||
|> aggregateWindow(every: 60s, fn: mean)
|
||||
|> drop(columns: ["_time"])`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
measurementsCond, hostsCond)
|
||||
case "socket":
|
||||
log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
case "core":
|
||||
log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
// Get Finest Granularity only, Set NULL to 0.0
|
||||
// query = fmt.Sprintf(`
|
||||
// from(bucket: "%s")
|
||||
// |> range(start: %s, stop: %s)
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> drop(columns: ["_start", "_stop", "cluster"])
|
||||
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
|
||||
// idb.bucket,
|
||||
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
|
||||
// measurementsCond, hostsCond)
|
||||
default:
|
||||
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
|
||||
continue
|
||||
// return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'")
|
||||
}
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
log.Error("Error while performing query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
|
||||
for _, metric := range metrics {
|
||||
jobMetric, ok := jobData[metric]
|
||||
if !ok {
|
||||
mc := archive.GetMetricConfig(job.Cluster, metric)
|
||||
jobMetric = map[schema.MetricScope]*schema.JobMetric{
|
||||
scope: { // uses scope var from above!
|
||||
Unit: mc.Unit,
|
||||
Timestep: mc.Timestep,
|
||||
Series: make([]schema.Series, 0, len(job.Resources)),
|
||||
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
|
||||
},
|
||||
}
|
||||
}
|
||||
jobData[metric] = jobMetric
|
||||
}
|
||||
|
||||
// Process Result: Time-Data
|
||||
field, host, hostSeries := "", "", schema.Series{}
|
||||
// typeId := 0
|
||||
switch scope {
|
||||
case "node":
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
|
||||
if host != "" {
|
||||
// Append Series before reset
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
|
||||
hostSeries = schema.Series{
|
||||
Hostname: host,
|
||||
Statistics: schema.MetricStatistics{}, //TODO Add Statistics
|
||||
Data: make([]schema.Float, 0),
|
||||
}
|
||||
}
|
||||
val, ok := row.Value().(float64)
|
||||
if ok {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
} else {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
|
||||
}
|
||||
}
|
||||
case "socket":
|
||||
continue
|
||||
case "core":
|
||||
continue
|
||||
// Include Series.Id in hostSeries
|
||||
// for rows.Next() {
|
||||
// row := rows.Record()
|
||||
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
|
||||
// if ( host != "" ) {
|
||||
// // Append Series before reset
|
||||
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
// }
|
||||
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
|
||||
// hostSeries = schema.Series{
|
||||
// Hostname: host,
|
||||
// Id: &typeId,
|
||||
// Statistics: nil,
|
||||
// Data: make([]schema.Float, 0),
|
||||
// }
|
||||
// }
|
||||
// val := row.Value().(float64)
|
||||
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
// }
|
||||
default:
|
||||
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
|
||||
continue
|
||||
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
|
||||
}
|
||||
// Append last Series
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
|
||||
// Get Stats
|
||||
stats, err := idb.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading statistics")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope == "node" { // No 'socket/core' support yet
|
||||
for metric, nodes := range stats {
|
||||
for node, stats := range nodes {
|
||||
for index, _ := range jobData[metric][scope].Series {
|
||||
if jobData[metric][scope].Series[index].Hostname == node {
|
||||
jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
// lenMet := len(metrics)
|
||||
|
||||
for _, metric := range metrics {
|
||||
// log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet)
|
||||
|
||||
query := fmt.Sprintf(`
|
||||
data = from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
|
||||
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
|
||||
data |> min(column: "_value") |> set(key: "_field", value: "min"),
|
||||
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|
||||
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|
||||
|> group()`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
metric, hostsCond)
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
log.Error("Error while performing query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodes := map[string]schema.MetricStatistics{}
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
host := row.ValueByKey("hostname").(string)
|
||||
|
||||
avg, avgok := row.ValueByKey("avg").(float64)
|
||||
if !avgok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg)
|
||||
avg = 0.0
|
||||
}
|
||||
min, minok := row.ValueByKey("min").(float64)
|
||||
if !minok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min)
|
||||
min = 0.0
|
||||
}
|
||||
max, maxok := row.ValueByKey("max").(float64)
|
||||
if !maxok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max)
|
||||
max = 0.0
|
||||
}
|
||||
|
||||
nodes[host] = schema.MetricStatistics{
|
||||
Avg: avg,
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
stats[metric] = nodes
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
|
||||
// TODO : Implement to be used in Analysis- und System/Node-View
|
||||
log.Infof("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes)
|
||||
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
|
||||
}
|
||||
@@ -1,21 +1,19 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type MetricDataRepository interface {
|
||||
@@ -24,333 +22,106 @@ type MetricDataRepository interface {
|
||||
Init(rawConfig json.RawMessage) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes for that node.
|
||||
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
|
||||
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
|
||||
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
|
||||
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
var upstreamMetricDataRepo MetricDataRepository
|
||||
|
||||
var useArchive bool
|
||||
// func Init() error {
|
||||
// for _, cluster := range config.Clusters {
|
||||
// if cluster.MetricDataRepository != nil {
|
||||
// var kind struct {
|
||||
// Kind string `json:"kind"`
|
||||
// }
|
||||
// if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
||||
// cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// var mdr MetricDataRepository
|
||||
// switch kind.Kind {
|
||||
// case "cc-metric-store":
|
||||
// mdr = &CCMetricStore{}
|
||||
// case "prometheus":
|
||||
// mdr = &PrometheusDataRepository{}
|
||||
// case "test":
|
||||
// mdr = &TestMetricDataRepository{}
|
||||
// default:
|
||||
// return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
// }
|
||||
//
|
||||
// if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
||||
// cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
// return err
|
||||
// }
|
||||
// metricDataRepos[cluster.Name] = mdr
|
||||
// }
|
||||
// }
|
||||
// return nil
|
||||
// }
|
||||
|
||||
func Init(disableArchive bool) error {
|
||||
useArchive = !disableArchive
|
||||
for _, cluster := range config.Keys.Clusters {
|
||||
if cluster.MetricDataRepository != nil {
|
||||
var kind struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json MetricDataRepository")
|
||||
return err
|
||||
}
|
||||
// func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
|
||||
// var err error
|
||||
// repo, ok := metricDataRepos[cluster]
|
||||
//
|
||||
// if !ok {
|
||||
// err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
// }
|
||||
//
|
||||
// return repo, err
|
||||
// }
|
||||
|
||||
var mdr MetricDataRepository
|
||||
switch kind.Kind {
|
||||
case "cc-metric-store":
|
||||
mdr = &CCMetricStore{}
|
||||
case "influxdb":
|
||||
mdr = &InfluxDBv2DataRepository{}
|
||||
case "prometheus":
|
||||
mdr = &PrometheusDataRepository{}
|
||||
case "test":
|
||||
mdr = &TestMetricDataRepository{}
|
||||
default:
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
}
|
||||
|
||||
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
||||
log.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.Name] = mdr
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
!useArchive {
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
log.Errorf("partial error: %s", err.Error())
|
||||
return err, 0, 0
|
||||
} else {
|
||||
log.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
jd, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
log.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
prepareJobData(job, jd, scopes)
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
log.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
// InitUpstreamRepos initializes global upstream metric data repository for the pull worker
|
||||
func InitUpstreamRepos() error {
|
||||
if config.Keys.UpstreamMetricRepository == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && useArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
var kind struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
if err := json.Unmarshal(*config.Keys.UpstreamMetricRepository, &kind); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw json UpstreamMetricRepository")
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
var mdr MetricDataRepository
|
||||
switch kind.Kind {
|
||||
case "cc-metric-store":
|
||||
mdr = &CCMetricStore{}
|
||||
case "prometheus":
|
||||
mdr = &PrometheusDataRepository{}
|
||||
case "test":
|
||||
mdr = &TestMetricDataRepository{}
|
||||
default:
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > Unknown UpstreamMetricRepository %v", kind.Kind)
|
||||
}
|
||||
|
||||
if err := mdr.Init(*config.Keys.UpstreamMetricRepository); err != nil {
|
||||
cclog.Errorf("Error initializing UpstreamMetricRepository %v", kind.Kind)
|
||||
return err
|
||||
}
|
||||
upstreamMetricDataRepo = mdr
|
||||
cclog.Infof("Initialized global upstream metric repository '%s'", kind.Kind)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, ok := metricDataRepos[cluster]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
// GetUpstreamMetricDataRepo returns the global upstream metric data repository
|
||||
func GetUpstreamMetricDataRepo() (MetricDataRepository, error) {
|
||||
if upstreamMetricDataRepo == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no upstream metric data repository configured")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
log.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]",
|
||||
job.ID, job.State, metrics, scopes)
|
||||
}
|
||||
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/mean/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
func prepareJobData(
|
||||
job *schema.Job,
|
||||
jobData schema.JobData,
|
||||
scopes []schema.MetricScope,
|
||||
) {
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jobData {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jobData.AddNodeScope("flops_any")
|
||||
jobData.AddNodeScope("mem_bw")
|
||||
}
|
||||
}
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
// TODO: Talk about this! What resolutions to store data at...
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
if job.NumNodes <= 8 {
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||
if err != nil {
|
||||
log.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if !useArchive {
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
|
||||
return upstreamMetricDataRepo, nil
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) 2022 DKRZ
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
@@ -21,8 +22,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
promapi "github.com/prometheus/client_golang/api"
|
||||
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
promcfg "github.com/prometheus/common/config"
|
||||
@@ -159,17 +160,17 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config PrometheusDataRepositoryConfig
|
||||
// parse config
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json config")
|
||||
cclog.Warn("Error while unmarshaling raw json config")
|
||||
return err
|
||||
}
|
||||
// support basic authentication
|
||||
var rt http.RoundTripper = nil
|
||||
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
|
||||
prom_pw := promcfg.Secret(prom_pw)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(config.Username, prom_pw, "", promapi.DefaultRoundTripper)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
|
||||
} else {
|
||||
if config.Username != "" {
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set.")
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
|
||||
}
|
||||
}
|
||||
// init client
|
||||
@@ -178,7 +179,7 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
RoundTripper: rt,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error("Error while initializing new prometheus client")
|
||||
cclog.Error("Error while initializing new prometheus client")
|
||||
return err
|
||||
}
|
||||
// init query client
|
||||
@@ -191,9 +192,9 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
for metric, templ := range config.Templates {
|
||||
pdb.templates[metric], err = template.New(metric).Parse(templ)
|
||||
if err == nil {
|
||||
log.Debugf("Added PromQL template for %s: %s", metric, templ)
|
||||
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
|
||||
} else {
|
||||
log.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
|
||||
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@@ -204,8 +205,8 @@ func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
metric string,
|
||||
scope schema.MetricScope,
|
||||
nodes []string,
|
||||
cluster string) (string, error) {
|
||||
|
||||
cluster string,
|
||||
) (string, error) {
|
||||
args := PromQLArgs{}
|
||||
if len(nodes) > 0 {
|
||||
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
|
||||
@@ -220,7 +221,7 @@ func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
|
||||
} else {
|
||||
query := buf.String()
|
||||
log.Debugf("PromQL: %s", query)
|
||||
cclog.Debugf("PromQL: %s", query)
|
||||
return query, nil
|
||||
}
|
||||
} else {
|
||||
@@ -233,12 +234,13 @@ func (pdb *PrometheusDataRepository) RowToSeries(
|
||||
from time.Time,
|
||||
step int64,
|
||||
steps int64,
|
||||
row *promm.SampleStream) schema.Series {
|
||||
row *promm.SampleStream,
|
||||
) schema.Series {
|
||||
ts := from.Unix()
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
// init array of expected length with NaN
|
||||
values := make([]schema.Float, steps+1)
|
||||
for i, _ := range values {
|
||||
for i := range values {
|
||||
values[i] = schema.NaN
|
||||
}
|
||||
// copy recorded values from prom sample pair
|
||||
@@ -263,8 +265,9 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
// TODO respect requested scope
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
@@ -276,13 +279,13 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
for i, resource := range job.Resources {
|
||||
nodes[i] = resource.Hostname
|
||||
}
|
||||
from := job.StartTime
|
||||
to := job.StartTime.Add(time.Duration(job.Duration) * time.Second)
|
||||
from := time.Unix(job.StartTime, 0)
|
||||
to := time.Unix(job.StartTime+int64(job.Duration), 0)
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
log.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
@@ -290,12 +293,12 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
|
||||
if metricConfig == nil {
|
||||
log.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
|
||||
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while formatting prometheus query")
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -306,13 +309,12 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
log.Warnf("Warnings: %v\n", warnings)
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
// init data structures
|
||||
@@ -335,7 +337,7 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
pdb.RowToSeries(from, step, steps, row))
|
||||
}
|
||||
// only add metric if at least one host returned data
|
||||
if !ok && len(jobMetric.Series) > 0{
|
||||
if !ok && len(jobMetric.Series) > 0 {
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
// sort by hostname to get uniform coloring
|
||||
@@ -351,14 +353,14 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
func (pdb *PrometheusDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
// map of metrics of nodes of stats
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job for stats")
|
||||
cclog.Warn("Error while loading job for stats")
|
||||
return nil, err
|
||||
}
|
||||
for metric, metricData := range data {
|
||||
@@ -376,7 +378,8 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
t0 := time.Now()
|
||||
// Map of hosts of metrics of value slices
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
@@ -388,19 +391,19 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
log.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while formatting prometheus query")
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -411,13 +414,12 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
log.Warnf("Warnings: %v\n", warnings)
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
@@ -442,6 +444,145 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
log.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in Job-View StatsTable
|
||||
func (pdb *PrometheusDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable
|
||||
scopedJobStats := make(schema.ScopedJobStats)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while loading job for scopedJobStats")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for metric, metricData := range data {
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric]; !ok {
|
||||
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric][scope]; !ok {
|
||||
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
|
||||
}
|
||||
|
||||
for _, series := range metricData[scope].Series {
|
||||
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
|
||||
Hostname: series.Hostname,
|
||||
Data: &series.Statistics,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return scopedJobStats, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in NodeList-View
|
||||
func (pdb *PrometheusDataRepository) LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList
|
||||
|
||||
// Fetch Data, based on pdb.LoadNodeData()
|
||||
t0 := time.Now()
|
||||
// Map of hosts of jobData
|
||||
data := make(map[string]schema.JobData)
|
||||
|
||||
// query db for each metric
|
||||
// TODO: scopes seems to be always empty
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// ranged query over all nodes
|
||||
r := promv1.Range{
|
||||
Start: from,
|
||||
End: to,
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
if err != nil {
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
steps := int64(to.Sub(from).Seconds()) / step
|
||||
|
||||
// iter rows of host, metric, values
|
||||
for _, row := range result.(promm.Matrix) {
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
|
||||
hostdata, ok := data[hostname]
|
||||
if !ok {
|
||||
hostdata = make(schema.JobData)
|
||||
data[hostname] = hostdata
|
||||
}
|
||||
|
||||
metricdata, ok := hostdata[metric]
|
||||
if !ok {
|
||||
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
data[hostname][metric] = metricdata
|
||||
}
|
||||
|
||||
// output per host, metric and scope
|
||||
scopeData, ok := metricdata[scope]
|
||||
if !ok {
|
||||
scopeData = &schema.JobMetric{
|
||||
Unit: metricConfig.Unit,
|
||||
Timestep: metricConfig.Timestep,
|
||||
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
|
||||
}
|
||||
data[hostname][metric][scope] = scopeData
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
@@ -9,14 +10,14 @@ import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
// Only a mock for unit-testing.
|
||||
// TestMetricDataRepository is only a mock for unit-testing.
|
||||
type TestMetricDataRepository struct{}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
|
||||
@@ -27,15 +28,26 @@ func (tmdr *TestMetricDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx)
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
@@ -44,7 +56,19 @@ func (tmdr *TestMetricDataRepository) LoadNodeData(
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
490
internal/metricdispatcher/dataLoader.go
Normal file
490
internal/metricdispatcher/dataLoader.go
Normal file
@@ -0,0 +1,490 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package metricdispatcher provides a unified interface for loading and caching job metric data.
|
||||
//
|
||||
// This package serves as a central dispatcher that routes metric data requests to the appropriate
|
||||
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
|
||||
// For completed jobs, data is retrieved from the file-based job archive.
|
||||
//
|
||||
// # Key Features
|
||||
//
|
||||
// - Automatic backend selection based on job state (running vs. archived)
|
||||
// - LRU cache for performance optimization (128 MB default cache size)
|
||||
// - Data resampling using Largest Triangle Three Bucket algorithm for archived data
|
||||
// - Automatic statistics series generation for jobs with many nodes
|
||||
// - Support for scoped metrics (node, socket, accelerator, core)
|
||||
//
|
||||
// # Cache Behavior
|
||||
//
|
||||
// Cached data has different TTL (time-to-live) values depending on job state:
|
||||
// - Running jobs: 2 minutes (data changes frequently)
|
||||
// - Completed jobs: 5 hours (data is static)
|
||||
//
|
||||
// The cache key is based on job ID, state, requested metrics, scopes, and resolution.
|
||||
//
|
||||
// # Usage
|
||||
//
|
||||
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
|
||||
//
|
||||
// jobData, err := metricdispatcher.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
// if err != nil {
|
||||
// // Handle error
|
||||
// }
|
||||
//
|
||||
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
|
||||
package metricdispatcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
|
||||
// The cache reduces load on both the metric store and archive backends.
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
// cacheKey generates a unique cache key for a job's metric data based on job ID, state,
|
||||
// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded
|
||||
// because job.ID is more unique and the cache TTL ensures entries don't persist indefinitely.
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs,
|
||||
// archive for completed jobs) and applies caching, resampling, and statistics generation as needed.
|
||||
//
|
||||
// For running jobs or when archive is disabled, data is fetched from the metric store.
|
||||
// For completed archived jobs, data is loaded from the job archive and resampled if needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - job: The job for which to load metric data
|
||||
// - metrics: List of metric names to load (nil loads all metrics for the cluster)
|
||||
// - scopes: Metric scopes to include (nil defaults to node scope)
|
||||
// - ctx: Context for cancellation and timeouts
|
||||
// - resolution: Target number of data points for resampling (only applies to archived data)
|
||||
//
|
||||
// Returns the loaded job data and any error encountered. For partial errors (some metrics failed),
|
||||
// the function returns the successfully loaded data with a warning logged.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
config.Keys.DisableArchive {
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = memorystore.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
} else {
|
||||
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jdTemp schema.JobData
|
||||
jdTemp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to load job data from archive for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
jd = deepCopy(jdTemp)
|
||||
|
||||
// Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points
|
||||
// to the requested resolution, improving transfer performance and client-side rendering.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := int64(0)
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = int(timestep)
|
||||
}
|
||||
}
|
||||
|
||||
// Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer.
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// Generate statistics series for jobs with many nodes to enable min/median/max graphs
|
||||
// instead of overwhelming the UI with individual node lines. Note that newly calculated
|
||||
// statistics use min/median/max, while archived statistics may use min/mean/max.
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// LoadAverages computes average values for the specified metrics across all nodes of a job.
|
||||
// For running jobs, it loads statistics from the metric store. For completed jobs, it uses
|
||||
// the pre-calculated averages from the job archive. The results are appended to the data slice.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
stats, err := memorystore.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator).
|
||||
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
|
||||
// statistics are loaded from the job archive.
|
||||
func LoadScopedJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
|
||||
}
|
||||
|
||||
scopedStats, err := memorystore.LoadScopedStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scopedStats, nil
|
||||
}
|
||||
|
||||
// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes.
|
||||
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
|
||||
// statistics are loaded from the job archive.
|
||||
func LoadJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.MetricStatistics, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadStatsFromArchive(job, metrics)
|
||||
}
|
||||
|
||||
data := make(map[string]schema.MetricStatistics, len(metrics))
|
||||
|
||||
stats, err := memorystore.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
|
||||
job.JobID, job.User, job.Project, err.Error())
|
||||
return data, err
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
min = math.Min(min, node.Min)
|
||||
max = math.Max(max, node.Max)
|
||||
}
|
||||
|
||||
data[m] = schema.MetricStatistics{
|
||||
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range.
|
||||
// This is used for node monitoring views and system status pages. Data is always fetched from
|
||||
// the metric store (not the archive) since it's for current/recent node status monitoring.
|
||||
//
|
||||
// Returns a nested map structure: node -> metric -> scoped data.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := memorystore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
|
||||
} else {
|
||||
cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range,
|
||||
// with optional resampling and automatic statistics generation for large datasets.
|
||||
// This is used for comparing multiple nodes or displaying node status over time.
|
||||
//
|
||||
// Returns a map of node names to their job-like metric data structures.
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := memorystore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
|
||||
cluster, subCluster, err.Error())
|
||||
} else {
|
||||
cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
|
||||
cluster, subCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Generate statistics series for datasets with many series to improve visualization performance.
|
||||
// Statistics are calculated as min/median/max.
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying
|
||||
// archived data (e.g., during resampling). This ensures the cached archive data remains
|
||||
// immutable while allowing per-request transformations.
|
||||
func deepCopy(source schema.JobData) schema.JobData {
|
||||
result := make(schema.JobData, len(source))
|
||||
|
||||
for metricName, scopeMap := range source {
|
||||
result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap))
|
||||
|
||||
for scope, jobMetric := range scopeMap {
|
||||
result[metricName][scope] = copyJobMetric(jobMetric)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func copyJobMetric(src *schema.JobMetric) *schema.JobMetric {
|
||||
dst := &schema.JobMetric{
|
||||
Timestep: src.Timestep,
|
||||
Unit: src.Unit,
|
||||
Series: make([]schema.Series, len(src.Series)),
|
||||
}
|
||||
|
||||
for i := range src.Series {
|
||||
dst.Series[i] = copySeries(&src.Series[i])
|
||||
}
|
||||
|
||||
if src.StatisticsSeries != nil {
|
||||
dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries)
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func copySeries(src *schema.Series) schema.Series {
|
||||
dst := schema.Series{
|
||||
Hostname: src.Hostname,
|
||||
Id: src.Id,
|
||||
Statistics: src.Statistics,
|
||||
Data: make([]schema.Float, len(src.Data)),
|
||||
}
|
||||
|
||||
copy(dst.Data, src.Data)
|
||||
return dst
|
||||
}
|
||||
|
||||
func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries {
|
||||
dst := &schema.StatsSeries{
|
||||
Min: make([]schema.Float, len(src.Min)),
|
||||
Mean: make([]schema.Float, len(src.Mean)),
|
||||
Median: make([]schema.Float, len(src.Median)),
|
||||
Max: make([]schema.Float, len(src.Max)),
|
||||
}
|
||||
|
||||
copy(dst.Min, src.Min)
|
||||
copy(dst.Mean, src.Mean)
|
||||
copy(dst.Median, src.Median)
|
||||
copy(dst.Max, src.Max)
|
||||
|
||||
if len(src.Percentiles) > 0 {
|
||||
dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles))
|
||||
for percentile, values := range src.Percentiles {
|
||||
dst.Percentiles[percentile] = make([]schema.Float, len(values))
|
||||
copy(dst.Percentiles[percentile], values)
|
||||
}
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
125
internal/metricdispatcher/dataLoader_test.go
Normal file
125
internal/metricdispatcher/dataLoader_test.go
Normal file
@@ -0,0 +1,125 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdispatcher
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
func TestDeepCopy(t *testing.T) {
|
||||
nodeId := "0"
|
||||
original := schema.JobData{
|
||||
"cpu_load": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Unit: schema.Unit{Base: "load", Prefix: ""},
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "node001",
|
||||
Id: &nodeId,
|
||||
Data: []schema.Float{1.0, 2.0, 3.0},
|
||||
Statistics: schema.MetricStatistics{
|
||||
Min: 1.0,
|
||||
Avg: 2.0,
|
||||
Max: 3.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
StatisticsSeries: &schema.StatsSeries{
|
||||
Min: []schema.Float{1.0, 1.5, 2.0},
|
||||
Mean: []schema.Float{2.0, 2.5, 3.0},
|
||||
Median: []schema.Float{2.0, 2.5, 3.0},
|
||||
Max: []schema.Float{3.0, 3.5, 4.0},
|
||||
Percentiles: map[int][]schema.Float{
|
||||
25: {1.5, 2.0, 2.5},
|
||||
75: {2.5, 3.0, 3.5},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
original["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] = 999.0
|
||||
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] = 888.0
|
||||
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] = 777.0
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] != 1.0 {
|
||||
t.Errorf("Series data was not deeply copied: got %v, want 1.0",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] != 1.0 {
|
||||
t.Errorf("StatisticsSeries was not deeply copied: got %v, want 1.0",
|
||||
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] != 1.5 {
|
||||
t.Errorf("Percentiles was not deeply copied: got %v, want 1.5",
|
||||
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Timestep != 60 {
|
||||
t.Errorf("Timestep not copied correctly: got %v, want 60",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Timestep)
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname != "node001" {
|
||||
t.Errorf("Hostname not copied correctly: got %v, want node001",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeepCopyNilStatisticsSeries(t *testing.T) {
|
||||
original := schema.JobData{
|
||||
"mem_used": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "node001",
|
||||
Data: []schema.Float{1.0, 2.0},
|
||||
},
|
||||
},
|
||||
StatisticsSeries: nil,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
if copied["mem_used"][schema.MetricScopeNode].StatisticsSeries != nil {
|
||||
t.Errorf("StatisticsSeries should be nil, got %v",
|
||||
copied["mem_used"][schema.MetricScopeNode].StatisticsSeries)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeepCopyEmptyPercentiles(t *testing.T) {
|
||||
original := schema.JobData{
|
||||
"cpu_load": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{},
|
||||
StatisticsSeries: &schema.StatsSeries{
|
||||
Min: []schema.Float{1.0},
|
||||
Mean: []schema.Float{2.0},
|
||||
Median: []schema.Float{2.0},
|
||||
Max: []schema.Float{3.0},
|
||||
Percentiles: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles != nil {
|
||||
t.Errorf("Percentiles should be nil when source is nil/empty")
|
||||
}
|
||||
}
|
||||
68
internal/repository/config.go
Normal file
68
internal/repository/config.go
Normal file
@@ -0,0 +1,68 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import "time"
|
||||
|
||||
// RepositoryConfig holds configuration for repository operations.
|
||||
// All fields have sensible defaults, so this configuration is optional.
|
||||
type RepositoryConfig struct {
|
||||
// CacheSize is the LRU cache size in bytes for job metadata and energy footprints.
|
||||
// Default: 1MB (1024 * 1024 bytes)
|
||||
CacheSize int
|
||||
|
||||
// MaxOpenConnections is the maximum number of open database connections.
|
||||
// Default: 4
|
||||
MaxOpenConnections int
|
||||
|
||||
// MaxIdleConnections is the maximum number of idle database connections.
|
||||
// Default: 4
|
||||
MaxIdleConnections int
|
||||
|
||||
// ConnectionMaxLifetime is the maximum amount of time a connection may be reused.
|
||||
// Default: 1 hour
|
||||
ConnectionMaxLifetime time.Duration
|
||||
|
||||
// ConnectionMaxIdleTime is the maximum amount of time a connection may be idle.
|
||||
// Default: 1 hour
|
||||
ConnectionMaxIdleTime time.Duration
|
||||
|
||||
// MinRunningJobDuration is the minimum duration in seconds for a job to be
|
||||
// considered in "running jobs" queries. This filters out very short jobs.
|
||||
// Default: 600 seconds (10 minutes)
|
||||
MinRunningJobDuration int
|
||||
}
|
||||
|
||||
// DefaultConfig returns the default repository configuration.
|
||||
// These values are optimized for typical deployments.
|
||||
func DefaultConfig() *RepositoryConfig {
|
||||
return &RepositoryConfig{
|
||||
CacheSize: 1 * 1024 * 1024, // 1MB
|
||||
MaxOpenConnections: 4,
|
||||
MaxIdleConnections: 4,
|
||||
ConnectionMaxLifetime: time.Hour,
|
||||
ConnectionMaxIdleTime: time.Hour,
|
||||
MinRunningJobDuration: 600, // 10 minutes
|
||||
}
|
||||
}
|
||||
|
||||
// repoConfig is the package-level configuration instance.
|
||||
// It is initialized with defaults and can be overridden via SetConfig.
|
||||
var repoConfig *RepositoryConfig = DefaultConfig()
|
||||
|
||||
// SetConfig sets the repository configuration.
|
||||
// This must be called before any repository initialization (Connect, GetJobRepository, etc.).
|
||||
// If not called, default values from DefaultConfig() are used.
|
||||
func SetConfig(cfg *RepositoryConfig) {
|
||||
if cfg != nil {
|
||||
repoConfig = cfg
|
||||
}
|
||||
}
|
||||
|
||||
// GetConfig returns the current repository configuration.
|
||||
func GetConfig() *RepositoryConfig {
|
||||
return repoConfig
|
||||
}
|
||||
@@ -1,15 +1,18 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/mattn/go-sqlite3"
|
||||
"github.com/qustavo/sqlhooks/v2"
|
||||
@@ -33,43 +36,63 @@ type DatabaseOptions struct {
|
||||
ConnectionMaxIdleTime time.Duration
|
||||
}
|
||||
|
||||
func setupSqlite(db *sql.DB) error {
|
||||
pragmas := []string{
|
||||
"temp_store = memory",
|
||||
}
|
||||
|
||||
for _, pragma := range pragmas {
|
||||
_, err := db.Exec("PRAGMA " + pragma)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func Connect(driver string, db string) {
|
||||
var err error
|
||||
var dbHandle *sqlx.DB
|
||||
|
||||
if driver != "sqlite3" {
|
||||
cclog.Abortf("Unsupported database driver '%s'. Only 'sqlite3' is supported.\n", driver)
|
||||
}
|
||||
|
||||
dbConnOnce.Do(func() {
|
||||
opts := DatabaseOptions{
|
||||
URL: db,
|
||||
MaxOpenConnections: 4,
|
||||
MaxIdleConnections: 4,
|
||||
ConnectionMaxLifetime: time.Hour,
|
||||
ConnectionMaxIdleTime: time.Hour,
|
||||
MaxOpenConnections: repoConfig.MaxOpenConnections,
|
||||
MaxIdleConnections: repoConfig.MaxIdleConnections,
|
||||
ConnectionMaxLifetime: repoConfig.ConnectionMaxLifetime,
|
||||
ConnectionMaxIdleTime: repoConfig.ConnectionMaxIdleTime,
|
||||
}
|
||||
|
||||
switch driver {
|
||||
case "sqlite3":
|
||||
// - Set WAL mode (not strictly necessary each time because it's persisted in the database, but good for first run)
|
||||
// - Set busy timeout, so concurrent writers wait on each other instead of erroring immediately
|
||||
// - Enable foreign key checks
|
||||
opts.URL += "?_journal=WAL&_timeout=5000&_fk=true"
|
||||
// TODO: Have separate DB handles for Writes and Reads
|
||||
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
|
||||
connectionURLParams := make(url.Values)
|
||||
connectionURLParams.Add("_txlock", "immediate")
|
||||
connectionURLParams.Add("_journal_mode", "WAL")
|
||||
connectionURLParams.Add("_busy_timeout", "5000")
|
||||
connectionURLParams.Add("_synchronous", "NORMAL")
|
||||
connectionURLParams.Add("_cache_size", "1000000000")
|
||||
connectionURLParams.Add("_foreign_keys", "true")
|
||||
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
|
||||
|
||||
if log.Loglevel() == "debug" {
|
||||
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
|
||||
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
|
||||
} else {
|
||||
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
case "mysql":
|
||||
opts.URL += "?multiStatements=true"
|
||||
dbHandle, err = sqlx.Open("mysql", opts.URL)
|
||||
if err != nil {
|
||||
log.Fatalf("sqlx.Open() error: %v", err)
|
||||
}
|
||||
default:
|
||||
log.Fatalf("unsupported database driver: %s", driver)
|
||||
if cclog.Loglevel() == "debug" {
|
||||
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
|
||||
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
|
||||
} else {
|
||||
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Abortf("DB Connection: Could not connect to SQLite database with sqlx.Open().\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
err = setupSqlite(dbHandle.DB)
|
||||
if err != nil {
|
||||
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
|
||||
@@ -78,16 +101,16 @@ func Connect(driver string, db string) {
|
||||
dbHandle.SetConnMaxIdleTime(opts.ConnectionMaxIdleTime)
|
||||
|
||||
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
|
||||
err = checkDBVersion(driver, dbHandle.DB)
|
||||
err = checkDBVersion(dbHandle.DB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetConnection() *DBConnection {
|
||||
if dbConnInstance == nil {
|
||||
log.Fatalf("Database connection not initialized!")
|
||||
cclog.Fatalf("Database connection not initialized!")
|
||||
}
|
||||
|
||||
return dbConnInstance
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
@@ -8,21 +8,21 @@ import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
// Hooks satisfies the sqlhook.Hooks interface
|
||||
type Hooks struct{}
|
||||
|
||||
// Before hook will print the query with it's args and return the context with the timestamp
|
||||
func (h *Hooks) Before(ctx context.Context, query string, args ...interface{}) (context.Context, error) {
|
||||
log.Debugf("SQL query %s %q", query, args)
|
||||
func (h *Hooks) Before(ctx context.Context, query string, args ...any) (context.Context, error) {
|
||||
cclog.Debugf("SQL query %s %q", query, args)
|
||||
return context.WithValue(ctx, "begin", time.Now()), nil
|
||||
}
|
||||
|
||||
// After hook will get the timestamp registered on the Before hook and print the elapsed time
|
||||
func (h *Hooks) After(ctx context.Context, query string, args ...interface{}) (context.Context, error) {
|
||||
func (h *Hooks) After(ctx context.Context, query string, args ...any) (context.Context, error) {
|
||||
begin := ctx.Value("begin").(time.Time)
|
||||
log.Debugf("Took: %s\n", time.Since(begin))
|
||||
cclog.Debugf("Took: %s\n", time.Since(begin))
|
||||
return ctx, nil
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
148
internal/repository/jobCreate.go
Normal file
148
internal/repository/jobCreate.go
Normal file
@@ -0,0 +1,148 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
const NamedJobCacheInsert string = `INSERT INTO job_cache (
|
||||
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
|
||||
);`
|
||||
|
||||
const NamedJobInsert string = `INSERT INTO job (
|
||||
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
|
||||
);`
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
|
||||
r.Mutex.Lock()
|
||||
defer r.Mutex.Unlock()
|
||||
|
||||
res, err := r.DB.NamedExec(NamedJobCacheInsert, job)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while NamedJobInsert")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
cclog.Warn("Error while getting last insert ID")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
|
||||
r.Mutex.Lock()
|
||||
defer r.Mutex.Unlock()
|
||||
|
||||
query := sq.Select(jobCacheColumns...).From("job_cache")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query %v", err)
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
_, err = r.DB.Exec(
|
||||
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while Job sync: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache")
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while Job cache clean: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
return r.InsertJob(job)
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobID int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
// Invalidate cache entries as job state is changing
|
||||
r.cache.Del(fmt.Sprintf("metadata:%d", jobID))
|
||||
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobID))
|
||||
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobID)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) StopCached(
|
||||
jobID int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
// Note: StopCached updates job_cache table, not the main job table
|
||||
// Cache invalidation happens when job is synced to main table
|
||||
stmt := sq.Update("job_cache").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job_cache.id = ?", jobID)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
322
internal/repository/jobFind.go
Normal file
322
internal/repository/jobFind.go
Normal file
@@ -0,0 +1,322 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
|
||||
|
||||
cclog.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindCached(
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
q := sq.Select(jobCacheColumns...).From("job_cache").
|
||||
Where("job_cache.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job_cache.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job_cache.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindAll executes a SQL query to find all batch jobs matching the given criteria.
|
||||
// Jobs are queried using the batch job id, and optionally filtered by cluster name
|
||||
// and start time (UNIX epoch time seconds).
|
||||
// It returns a slice of pointers to schema.Job data structures and an error variable.
|
||||
// An empty slice is returned if no matching jobs are found.
|
||||
func (r *JobRepository) FindAll(
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
jobs := make([]*schema.Job, 0, 10)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
cclog.Debugf("Timer FindAll %s", time.Since(start))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// GetJobList returns job IDs for non-running jobs.
|
||||
// This is useful to process large job counts and intended to be used
|
||||
// together with FindById to process jobs one by one.
|
||||
// Use limit and offset for pagination. Use limit=0 to get all results (not recommended for large datasets).
|
||||
func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
|
||||
query := sq.Select("id").From("job").
|
||||
Where("job.job_state != 'running'")
|
||||
|
||||
// Add pagination if limit is specified
|
||||
if limit > 0 {
|
||||
query = query.Limit(uint64(limit)).Offset(uint64(offset))
|
||||
}
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
jl := make([]int64, 0, 1000)
|
||||
for rows.Next() {
|
||||
var id int64
|
||||
err := rows.Scan(&id)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jl = append(jl, id)
|
||||
}
|
||||
|
||||
cclog.Infof("Return job count %d", len(jl))
|
||||
return jl, nil
|
||||
}
|
||||
|
||||
// FindByID executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByID(ctx context.Context, jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIDWithUser executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id. The user is passed directly,
|
||||
// instead as part of the context.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIDWithUser(user *schema.User, jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
|
||||
q, qerr := SecurityCheckWithUser(user, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIDDirect executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIDDirect(jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByJobID executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id and the clustername.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByJobID(ctx context.Context, jobID int64, startTime int64, cluster string) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobID).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// IsJobOwner executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id,a username and the cluster.
|
||||
// It returns a bool.
|
||||
// If job was found, user is owner: test err != sql.ErrNoRows
|
||||
func (r *JobRepository) IsJobOwner(jobID int64, startTime int64, user string, cluster string) bool {
|
||||
q := sq.Select("id").
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobID).
|
||||
Where("job.hpc_user = ?", user).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
return err != sql.ErrNoRows
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindConcurrentJobs(
|
||||
ctx context.Context,
|
||||
job *schema.Job,
|
||||
) (*model.JobLinkResultList, error) {
|
||||
if job == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
startTime = job.StartTime
|
||||
hostname := job.Resources[0].Hostname
|
||||
|
||||
if job.State == schema.JobStateRunning {
|
||||
stopTime = time.Now().Unix()
|
||||
} else {
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
items := make([]*model.JobLink, 0, 10)
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobID, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobID, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobID.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobID, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobID, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobID.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
cnt := len(items)
|
||||
|
||||
return &model.JobLinkResultList{
|
||||
ListQuery: &queryString,
|
||||
Items: items,
|
||||
Count: &cnt,
|
||||
}, nil
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user