mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-11-15 14:53:47 +01:00
Compare commits
794 Commits
v1.3.1
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
68e0159292 | ||
| 8555a88202 | |||
|
|
bb357f7cab | ||
|
|
d9b240cd2d | ||
|
|
bea5ee96d9 | ||
|
|
7d205fd526 | ||
|
|
c15b2a0cbb | ||
|
|
7ccba30a3d | ||
|
|
8091485588 | ||
|
|
1413f968d6 | ||
|
|
d1d1bb09e9 | ||
|
|
3c1a7e0171 | ||
|
|
0cb50f2f01 | ||
|
|
2287586700 | ||
|
|
ea7660ddb3 | ||
|
|
44e98e8f2f | ||
|
|
856ccbb969 | ||
|
|
0920286b4c | ||
|
|
f34e10cfd9 | ||
| bc43c844fc | |||
| 67be9aa27b | |||
| 047b997a22 | |||
| bac51891b7 | |||
|
|
714d6af7cd | ||
| 6efd6334bb | |||
| 91f4475d76 | |||
|
|
de309784b4 | ||
|
|
a623cf53f3 | ||
| 440cd59e50 | |||
| eefb6f6265 | |||
| f5e1226837 | |||
| 151f7e701f | |||
| 40398497c2 | |||
|
|
cda10788fb | ||
|
|
845905d9c8 | ||
| 89055506d6 | |||
|
|
5908ae7905 | ||
|
|
4131665284 | ||
|
|
6a43dfb0d7 | ||
| 3d38d78845 | |||
| 600f19ac80 | |||
|
|
0a3a664653 | ||
|
|
471ec1cd2e | ||
|
|
e296cd7ca0 | ||
|
|
31cfa8cd7c | ||
|
|
70fe8aa367 | ||
|
|
cc9dafac6f | ||
|
|
32429f1481 | ||
| 9485a463b8 | |||
| 35c6ab4a08 | |||
| e58b0fa015 | |||
| beb92967e5 | |||
| 015583f1cd | |||
| d40c54b802 | |||
| 647665b6b9 | |||
| 4fc78bc382 | |||
| 50d000e7e2 | |||
|
|
ad500c4bef | ||
|
|
916077c6f8 | ||
|
|
935fb238a4 | ||
|
|
d03e5b4562 | ||
|
|
05c45c6468 | ||
| 9020613a8b | |||
| be92d5943d | |||
|
|
b2368a0751 | ||
| 7948d5f773 | |||
|
|
1a16851ad0 | ||
|
|
810c14a839 | ||
|
|
df0e8eb228 | ||
| 79605c8a9e | |||
|
|
9b644119ae | ||
|
|
ffa9919019 | ||
|
55ca892f90
|
|||
|
eaca187032
|
|||
|
|
3b9d05cc6d | ||
| d00881de2e | |||
| d8e85cf75d | |||
| 39f21763e4 | |||
|
|
af43901ca3 | ||
|
|
62565b9ae2 | ||
|
|
bca176170c | ||
|
|
2a91ca0cff | ||
|
|
19a75554b0 | ||
|
|
58ae476a3e | ||
|
|
44d8254a0b | ||
|
|
bd2cdfcef2 | ||
| a50b832c2a | |||
|
|
10194105e3 | ||
|
|
b474288df7 | ||
|
|
f338209f32 | ||
|
|
bef832e45b | ||
|
|
71cfb4db77 | ||
| 86453e7e11 | |||
|
|
98b9f8e62d | ||
| 44cd8d258d | |||
| 764b65d094 | |||
|
|
4d2c64b012 | ||
|
|
35c0b0be58 | ||
|
|
7a54e2cfb3 | ||
|
|
54283f6d3c | ||
|
|
697acd1d88 | ||
|
|
5cdb80b4d6 | ||
|
|
e48ff8be73 | ||
|
|
096217eea6 | ||
|
|
ed5290be86 | ||
|
|
b036c3903c | ||
|
|
57b43b7b60 | ||
| ab1ddb7bd1 | |||
| 881f2f32f4 | |||
| 0754ba5292 | |||
|
|
743a89c3a2 | ||
|
|
6692c3ab7c | ||
|
|
c16a5fdac4 | ||
|
|
60ec7e54f5 | ||
| dd48f5ab87 | |||
|
|
db674ec31d | ||
|
|
48150ffc8b | ||
|
|
1ad80efab6 | ||
|
|
aa8789f8f8 | ||
|
|
56e3f2da5c | ||
|
|
a4104822e2 | ||
|
|
c13f386e3b | ||
| 4bd73450b5 | |||
| 64da28e814 | |||
| 639e1b9c6d | |||
|
|
63e828d2df | ||
|
|
b8c30b5703 | ||
|
|
805ea91fc2 | ||
|
|
c4c422da57 | ||
| 544fb35121 | |||
| 43edccb284 | |||
| 7531ba4b5c | |||
| 983aa592d8 | |||
| 8378784231 | |||
| dca25cc601 | |||
|
|
c8fe81cd80 | ||
| c0a4724f57 | |||
| 484c52d813 | |||
|
|
47843b2087 | ||
|
|
c3a6126799 | ||
|
|
e94b250541 | ||
|
|
db5f6c7540 | ||
|
|
79a6c9e90d | ||
| e2e67e3977 | |||
| 6c06450701 | |||
|
|
d7379a1af2 | ||
|
|
d731611e0c | ||
|
|
dceb92ba8e | ||
|
|
1e039cb1bf | ||
|
6f3e1ffbe3
|
|||
|
|
6a6dca3fce | ||
|
|
d6d92071bf | ||
|
|
d40657dc64 | ||
|
|
6dde2a1e59 | ||
|
|
b7823cec16 | ||
|
|
eabd7b8d51 | ||
|
|
27ec445e54 | ||
|
|
ad108b285f | ||
|
|
f471214ef7 | ||
|
|
a0190f8f40 | ||
| 82af984023 | |||
| 0373010497 | |||
|
|
c22d869aa7 | ||
| 87c93e90cd | |||
| 3d6dca9386 | |||
|
|
f946e7e6ab | ||
|
|
d50dfa5867 | ||
| 249128e011 | |||
| ca16a80b1f | |||
|
|
e789e7ba9b | ||
|
|
5048f7be14 | ||
|
|
0e3603f596 | ||
| 9cd4b3c1cc | |||
| 1d9aa75960 | |||
|
|
0a24ef70e0 | ||
| 3b5d3d671e | |||
| 7db83d216e | |||
| d1a7002422 | |||
| 1d8e7e072f | |||
|
7466fe7a34
|
|||
|
|
24cf5047da | ||
|
|
1f103e5ef5 | ||
|
|
9e87974eb1 | ||
|
|
d806cf76c4 | ||
|
|
6e2703998d | ||
| 6f9737c2c2 | |||
|
|
5e696c10d5 | ||
|
|
927e25c72c | ||
| 8b1b99ba35 | |||
| 2c102cd1ff | |||
|
|
42c4926c47 | ||
|
|
703556d893 | ||
|
|
0b529a5c3c | ||
|
|
5186b3f61e | ||
| 4dc0da5099 | |||
| 1bad6ba065 | |||
| 3efee22536 | |||
| eef48ac3a3 | |||
| e35cfbc3dd | |||
| 4a5fd96b32 | |||
|
|
bdffe73f59 | ||
| cdfe722457 | |||
| 0aecea6de2 | |||
| 5a88c77171 | |||
| 8003217092 | |||
| 9b325041c1 | |||
| 1e7fbe5d56 | |||
| 0261c263f9 | |||
| 8d6ae85b0d | |||
| f14bdb3068 | |||
| 3c66840f95 | |||
| 733e3ea9d5 | |||
|
ca634bb707
|
|||
| 9abc206d1a | |||
| 85f17c0fd8 | |||
| 14bad81b9f | |||
|
|
ffd596e2c7 | ||
| 99f8187092 | |||
| f30b784f45 | |||
| f06b5f8fc0 | |||
| 2e781b900d | |||
| d76b1ae75d | |||
| 40110580e0 | |||
| eab7961a83 | |||
| 432e06e801 | |||
| fe1ff5c7a3 | |||
| 6e66b8e08b | |||
| 7abdd0545e | |||
|
|
3f1768e467 | ||
|
|
f464921ae3 | ||
|
|
7603ad3fb0 | ||
|
|
be7ccc78b8 | ||
|
|
b3135c982f | ||
|
13386175f5
|
|||
|
23e8f3dc2d
|
|||
|
|
b323ce2eef | ||
|
|
08e323ba51 | ||
|
|
9f50f36b1d | ||
|
|
4399c1d590 | ||
|
|
f7376f6dca | ||
|
|
518cb34340 | ||
|
|
f210a5f508 | ||
|
|
9ebc49dd1c | ||
|
|
c119eeb468 | ||
|
|
ab616f8f79 | ||
|
|
69286881e4 | ||
|
|
4419df8d1b | ||
|
|
aed2bd48fc | ||
|
|
d3d752f90c | ||
|
|
33ecfe88ef | ||
|
|
fd52fdd35b | ||
|
|
1d13d3dccf | ||
|
|
1c84bcae35 | ||
|
|
df497d5952 | ||
|
|
f65e122f8d | ||
| 161f0744aa | |||
| 95de9ad3b3 | |||
|
|
d5c170055f | ||
|
|
61f0521072 | ||
|
|
6ca14c55f2 | ||
|
|
1309d09aee | ||
| aba75b3a19 | |||
|
|
e87481d8db | ||
| acaad69917 | |||
|
|
ff588ad57a | ||
| 65df27154c | |||
| 8dfa1957f4 | |||
| 570eba3794 | |||
| 94a39fc61f | |||
| 2d359e5f99 | |||
|
|
04692e0c44 | ||
|
|
809fd23b88 | ||
|
|
e3653daea3 | ||
|
|
48fa75386c | ||
|
|
1b3a12a4dc | ||
|
|
543ddf540e | ||
|
|
a3fb471546 | ||
|
|
277f964b30 | ||
|
|
9bcf7adb67 | ||
|
|
f343fa0071 | ||
|
|
e5862e9218 | ||
|
|
29ae2423f8 | ||
|
|
1755a4a7df | ||
|
|
25d3325049 | ||
|
|
fb6a4c3b87 | ||
| 317f80a984 | |||
| 28cdc1d9e5 | |||
| c2087b15d5 | |||
| a8d785beb3 | |||
|
|
a6784b5549 | ||
|
|
d770292be8 | ||
|
|
b3a1037ade | ||
|
|
02946cf0b4 | ||
|
|
cf051d5108 | ||
|
|
96977c6183 | ||
|
|
73d83164fc | ||
|
|
1064f5e4a8 | ||
|
|
5be98c7087 | ||
|
|
0d689c7dff | ||
|
|
1f24ed46a0 | ||
|
|
92b4159f9e | ||
|
|
5817b41e29 | ||
| d6b132e3a6 | |||
|
|
318f70f34c | ||
|
|
e41525d40a | ||
|
|
a102220e52 | ||
|
|
e9a214c5b2 | ||
|
|
c53f5eb144 | ||
|
|
9ed64e0388 | ||
|
|
93040d4629 | ||
|
|
0144ad43f5 | ||
|
|
8da2fc30c3 | ||
| 0e27ae7795 | |||
| 33c6cdb9fe | |||
|
|
73b7014469 | ||
| 25aaf55b93 | |||
| 6a7546c43b | |||
| 0adda4bf7b | |||
|
|
f5f36427a4 | ||
|
|
590bfd3a10 | ||
|
|
16db9bd1a2 | ||
|
|
d0af933b35 | ||
|
|
2b56b40e6d | ||
|
|
4b2d7068b3 | ||
|
|
bd93b8be8e | ||
|
|
aa3fe2b872 | ||
|
|
a61ff915ac | ||
|
|
0a3e678329 | ||
|
|
d4336b0dcb | ||
|
|
65d2698af4 | ||
|
|
6454576417 | ||
|
|
a485bd5977 | ||
|
|
e733688fd0 | ||
|
|
e86f6a8cbd | ||
|
|
fcc9e17664 | ||
|
|
5c9d4ffa9a | ||
|
|
419bc2747b | ||
|
|
1ee99d6866 | ||
|
|
3ab8973895 | ||
|
|
acfa3baeb5 | ||
|
|
c21d7cf101 | ||
|
|
ec895e1d9e | ||
|
|
c964f09a4f | ||
|
|
0bc32f27df | ||
|
|
6640e93ce9 | ||
|
|
d7aefe0cf0 | ||
|
|
187fe5b361 | ||
|
|
b31aea7bc5 | ||
|
c661baf058
|
|||
|
|
0fe0461340 | ||
|
|
d5394c9e92 | ||
|
|
42135fd26c | ||
|
|
38569f55c7 | ||
|
|
5ce03c2db3 | ||
|
|
1031b3eb79 | ||
|
|
fcdf4cd476 | ||
| 6268dffff8 | |||
| c10737bfd7 | |||
|
|
bd0cc69668 | ||
|
|
84fffac264 | ||
|
|
5bf968010e | ||
|
|
61bc095d01 | ||
|
|
e376f97547 | ||
|
|
f2428d3cb3 | ||
|
|
2fdac85d31 | ||
|
|
b731395689 | ||
|
|
07405e3466 | ||
|
|
fc0c76bd77 | ||
|
|
d209547968 | ||
| 632b9fc5ea | |||
| 702591b4ec | |||
|
|
c562746e5f | ||
|
|
c0443cbec2 | ||
|
|
0191bc3821 | ||
|
|
633bd42036 | ||
|
|
998ef8d834 | ||
|
|
c25b076ca9 | ||
|
|
f43379f365 | ||
|
|
d902c0acf4 | ||
|
|
58e678d72c | ||
|
|
cbc49669d0 | ||
|
|
78bb638fd6 | ||
|
|
7a61bae471 | ||
|
|
e1b992526e | ||
|
|
1b043838ea | ||
|
|
07e72294dc | ||
|
|
b6b37ee68b | ||
|
|
43cb1f1bff | ||
|
|
f7a67c72bf | ||
|
|
c5476d08fa | ||
|
|
8af92b1557 | ||
|
|
eaa826bb8a | ||
|
|
140b3c371d | ||
|
|
f158eaa29c | ||
|
|
c4b98ade53 | ||
|
|
f2e85306ca | ||
|
|
42b9de8360 | ||
|
|
6c244f3121 | ||
|
|
9f56213d2f | ||
|
|
fb2f7cf680 | ||
|
|
8fcdd24f84 | ||
|
|
aaafde4a7c | ||
|
|
2b23003556 | ||
|
|
5681062f01 | ||
|
|
d61bf212f5 | ||
|
|
2bd7c8d51e | ||
|
|
1e63cdbcda | ||
|
|
86d85f12be | ||
|
|
dd470d49ec | ||
|
|
95d8062b00 | ||
|
|
8f82399214 | ||
|
|
6247150e9c | ||
|
5266644725
|
|||
|
81d9e96552
|
|||
|
|
4ec9f06114 | ||
|
0033e9f6c0
|
|||
|
571652c314
|
|||
|
|
7ec233e18a | ||
|
|
13c9a12336 | ||
|
|
83d472ecd6 | ||
|
|
c21da6512a | ||
|
|
4b4374e0df | ||
|
|
407276a04d | ||
|
|
64f60905b4 | ||
|
|
9e6072fed2 | ||
|
|
a3e5c424fd | ||
|
|
6683a350aa | ||
|
|
05bfa9b546 | ||
|
|
735988decb | ||
|
|
d0580592be | ||
|
|
817076bdbf | ||
|
|
736236e9ca | ||
|
|
3f4114c51b | ||
|
|
5c2c493c56 | ||
|
|
2c383ebea1 | ||
|
|
91e73450cf | ||
|
|
e55798944e | ||
|
|
5ea11a5ad2 | ||
|
|
2a3383e9e6 | ||
|
|
e871703724 | ||
|
|
1ee367d7be | ||
|
|
bce536b9b4 | ||
|
|
7c9182e0b0 | ||
|
|
aa915d639d | ||
|
|
9489ebc7d6 | ||
|
2a5c525193
|
|||
|
9e2d981c60
|
|||
|
|
53dfe9e4f5 | ||
|
48e95fbdb0
|
|||
|
fd94d85edf
|
|||
|
f2d1a85afb
|
|||
|
0bdbcb8bab
|
|||
|
|
7b91a819be | ||
| bc89025924 | |||
|
|
16bcaef4c3 | ||
|
|
fcbfa451f2 | ||
|
|
559ce53ca4 | ||
|
|
ee2c5b58d7 | ||
|
|
d98d998106 | ||
|
212c45e070
|
|||
|
143fa9b6ed
|
|||
|
4849928288
|
|||
|
|
9248ee8868 | ||
|
|
1616d96732 | ||
| 0bbedd1600 | |||
|
|
c7e49644d8 | ||
|
010c903c74
|
|||
|
e4d12e3537
|
|||
|
051cc8384e
|
|||
|
49a94170d2
|
|||
|
|
42e8e37bd4 | ||
|
|
5d2c350ce2 | ||
|
|
85dc0362c1 | ||
|
|
01c06728eb | ||
|
|
257250714d | ||
|
|
3b769c3059 | ||
|
|
a7395ed45b | ||
|
|
ab07c7928f | ||
|
|
b0c0d15505 | ||
|
|
fcf50790da | ||
|
|
1e43654607 | ||
|
|
4fecbe820d | ||
|
|
763c9dfa6b | ||
|
9de5879786
|
|||
|
|
9396e7492c | ||
|
3ac3415178
|
|||
|
1aae1c59d0
|
|||
|
907e80a01c
|
|||
|
|
8a10b69716 | ||
|
|
1a3cf7edd6 | ||
|
|
76d0fc979b | ||
|
|
a42d8ece35 | ||
|
|
93377f53fc | ||
|
|
c853d74ba0 | ||
|
|
0b9f74f4f4 | ||
|
|
5da6baf828 | ||
|
5766945006
|
|||
|
a53d473b58
|
|||
|
|
d1207ad80e | ||
|
|
e2efe71b33 | ||
|
|
2aef6ed9c0 | ||
|
|
fcb6db0603 | ||
| 01b1136316 | |||
|
|
2512fe9e75 | ||
|
|
f89b5cd2ec | ||
|
|
ab284ed208 | ||
|
|
00a578657c | ||
|
|
38ce40ae7d | ||
| e1be6c7138 | |||
| 28539e60b0 | |||
|
adb11b3ed0
|
|||
|
|
f1e6dedd44 | ||
|
|
8ea1454c06 | ||
| 81b8d578f2 | |||
|
|
16b11db39c | ||
| 0d923cc920 | |||
| c523e93564 | |||
| d588798ea1 | |||
| a11f165f2a | |||
|
|
d4f487d554 | ||
|
|
93d5a0e532 | ||
|
|
00ddc462d2 | ||
|
|
5f4a74f8ba | ||
|
|
a8eff6fbd1 | ||
|
|
baa7367ebe | ||
|
|
69f8a34aac | ||
|
|
21b3a67988 | ||
|
|
d89574ce73 | ||
| ddeac6b9d9 | |||
| 17906ec0eb | |||
|
|
311c088d3d | ||
| a2584d6083 | |||
| 35bd7739c6 | |||
| 7f43c88a39 | |||
|
|
fc1c54a141 | ||
|
|
2af111c584 | ||
| c093cca8b1 | |||
|
|
2bb1b78ba4 | ||
| 3ab26172c4 | |||
| cdd45ce88b | |||
|
210a7d3136
|
|||
|
92ec64d80f
|
|||
|
ff37f71fdb
|
|||
|
6056341525
|
|||
|
|
075612f5bd | ||
| 1a87ed8210 | |||
|
|
c05ffeb16d | ||
| ee3710c5ed | |||
| 4327c4b1f7 | |||
| 492e56a098 | |||
| f0257a2784 | |||
| ec1ead89ab | |||
|
|
ae53e87aba | ||
|
|
939dd2320a | ||
|
|
2c8b73e2e2 | ||
|
|
eabc6212ea | ||
|
|
c120d6517f | ||
|
|
597ee1dad7 | ||
|
|
c4a901504d | ||
|
|
f5cc5d07fd | ||
|
|
8a0e6c921c | ||
|
|
bf1bff9ace | ||
|
|
06f24e988f | ||
|
|
ae327f545e | ||
|
|
35012b18c5 | ||
|
|
9688bad622 | ||
|
|
447b8d3372 | ||
|
|
01102cb9b0 | ||
|
|
934d1a6114 | ||
|
|
6f74c8cb77 | ||
|
|
63b9e619a4 | ||
|
|
82e28f26d7 | ||
|
|
ca9fd96baa | ||
|
|
39b22267d6 | ||
|
|
60d7984d66 | ||
|
|
33d219d2ac | ||
|
|
85a77e05af | ||
|
|
3dfeabcec6 | ||
|
|
673fdc443c | ||
|
|
2f6e5a7648 | ||
|
|
2cbe8e9517 | ||
|
|
2f0460d6ec | ||
|
|
37f4ed7770 | ||
|
|
e3104c61cb | ||
|
|
bc434ee8cb | ||
|
|
f4102b948e | ||
|
|
ed991de11a | ||
|
|
322e161064 | ||
|
|
1adc741cc2 | ||
|
|
4eff87bbf7 | ||
|
|
fc6970d08a | ||
|
|
f616c7e1c6 | ||
|
|
89ec749172 | ||
|
|
182f0f2c64 | ||
|
|
e3681495ce | ||
|
|
37415fa261 | ||
|
|
7243dbe763 | ||
|
|
0ff5c4bedd | ||
|
|
f047f89ad5 | ||
|
|
0eb0aa1d3b | ||
|
|
6019891591 | ||
|
|
615281601c | ||
|
|
82baf5d384 | ||
|
|
6fe93ecb7e | ||
|
|
b3222f3523 | ||
|
|
3b94863521 | ||
|
|
582dc8bf46 | ||
|
|
a9868fd275 | ||
|
|
218e56576a | ||
|
|
c50e79375a | ||
|
|
dcb8308f35 | ||
|
|
183b310696 | ||
|
|
c7d0c86d52 | ||
|
|
48225662b1 | ||
|
|
f53fc088ec | ||
|
|
05517fcbcd | ||
|
|
18af51b0a4 | ||
|
|
ede3da7a87 | ||
|
|
8e3327ef6a | ||
|
|
827f6daabc | ||
|
|
2567442321 | ||
|
|
9cf5478519 | ||
|
|
e5275311c2 | ||
|
|
21e4870e4c | ||
|
|
beba7c8d2e | ||
|
|
fe35313305 | ||
|
|
d7a8bbf40b | ||
|
|
f1893c596e | ||
|
|
6367c1ab4d | ||
|
|
9579887fc4 | ||
|
|
e29be2f140 | ||
|
|
2736b5d1ef | ||
|
|
ff52fb16b6 | ||
|
|
ccbf3867e1 | ||
|
|
f0de422c6e | ||
|
|
64cc19b252 | ||
|
|
26226009f0 | ||
|
|
d10e09da02 | ||
|
|
00a2e58fee | ||
|
|
b1cb45dfe6 | ||
|
|
a2951d1f05 | ||
|
|
c0b1e97602 | ||
|
|
71621a9dc4 | ||
|
|
b3ed2afebe | ||
|
|
704620baff | ||
|
|
8feb805167 | ||
|
|
065b32755a | ||
|
|
1b5f4bff2c | ||
|
|
8e1c5a485f | ||
| 5fa6c9db35 | |||
| 5482b9be2c | |||
|
|
7400273b0a | ||
|
|
0b7cdde4a0 | ||
|
|
d5382aec4f | ||
|
|
df484dc816 | ||
|
|
7ea4086807 | ||
|
|
b04bf6a951 | ||
| 7c33dcf630 | |||
| 5e65e21f0b | |||
| 53ca38ce53 | |||
|
|
398e3c1b91 | ||
| 508978d586 | |||
| e267481f71 | |||
|
|
193bee5ac8 | ||
| f58efa2871 | |||
| 6568b6d723 | |||
|
|
4b1b34d8a7 | ||
| 39c09f8565 | |||
|
|
275a77807e | ||
|
|
6443541a79 | ||
|
|
5eb6f7d307 | ||
|
|
bce2a66177 | ||
|
|
7602641909 | ||
|
|
54f3a261c5 | ||
|
|
906bac965f | ||
|
|
4ec1de6900 | ||
|
|
8ded131666 | ||
| 47b14f932e | |||
|
|
838ebb3f69 | ||
| c459724114 | |||
| b0c9d1164d | |||
| 7c51d88501 | |||
| 5b03cf826b | |||
| f305863616 | |||
| db5809d522 | |||
|
|
83df6f015c | ||
| e7231b0e13 | |||
|
|
cff60eb51c | ||
|
f914a312f5
|
|||
| 56ebb301ca | |||
|
|
a59df12595 | ||
|
|
5cc7fc6ccb | ||
|
|
55027cb630 | ||
|
|
036eba68e1 | ||
|
|
d34e0d9348 | ||
|
|
31765ce0ef | ||
|
|
9fe7cdca92 | ||
|
|
adc3502b6b | ||
|
|
95fe369648 | ||
|
|
01845a0cb7 | ||
|
|
708eaf4178 | ||
|
|
d629a58712 | ||
|
|
90886b63d6 | ||
|
|
084f89fa32 | ||
|
|
ceb3a095d8 | ||
|
|
1758275f11 | ||
|
|
e74e506ffe | ||
|
|
599a36466a | ||
|
|
613e128cab | ||
|
|
e4f8022b7a | ||
|
|
5603c41900 | ||
| a8a27c9b51 | |||
|
|
b70de5a4be | ||
|
|
b1fd07cd30 | ||
|
|
6ab2e02fe6 | ||
|
|
5535c5780c | ||
|
|
49e0a2c055 | ||
|
|
efbe53b6b4 | ||
|
5e074dad10
|
|||
|
d6a88896d0
|
|||
|
5c99f5f8bb
|
|||
|
e1faba0ff2
|
|||
|
ba2f406bc0
|
|||
|
9b6db4684a
|
|||
|
|
561fd41d5d | ||
|
|
ce9995dac7 | ||
|
|
0afaea9513 | ||
|
|
9b5c6e3164 | ||
|
|
e6ebec8c1e | ||
|
|
2551921ed6 | ||
|
|
e02575aad7 | ||
|
|
ff3502c87a | ||
|
|
017f9b2140 | ||
|
|
c80d3a6958 | ||
|
|
3ca1127685 | ||
|
|
18369da5bc | ||
|
|
e65100cdc8 | ||
|
|
6a1cb51c2f | ||
|
c4d93e492b
|
|||
|
c2f72f72ac
|
|||
|
721b6b2afa
|
|||
|
b6f011c669
|
|||
|
801607fc16
|
|||
|
01a4d33514
|
|||
|
e348ec74fd
|
|||
|
0458675608
|
|||
|
c61ffce0e9
|
|||
|
68a97dc980
|
|||
|
a07d167390
|
|||
|
|
a8721dcc69 | ||
|
|
68cf952ac6 | ||
|
|
e14d6a81fe | ||
|
|
a4912893a8 | ||
|
0adfb631ef
|
|||
|
b64ce1f67f
|
|||
|
e8e3b1595d
|
|||
|
f1427d5272
|
|||
|
|
bf6b87d65c | ||
|
|
0240997257 | ||
|
|
f1e341f0b9 | ||
|
a54acb8c42
|
|||
|
c6ede67589
|
|||
|
|
11176da5d8 | ||
|
|
0a604336c4 | ||
|
|
be9df7649f | ||
|
|
63fb923995 | ||
|
|
3afe40083d | ||
|
|
9d4767539c | ||
|
ac9bba8b5b
|
|||
|
80c46bea7f
|
|||
|
|
614f694777 | ||
|
|
1072d7b449 | ||
|
1b70596735
|
|||
|
|
61eebc9fbd | ||
|
b05909969f
|
|||
|
bd89ce7cc9
|
|||
|
130613b717
|
|||
|
b3c1f39a0e
|
|||
|
97c807cd33
|
|||
|
aede5f71ec
|
|||
|
786770f56a
|
|||
|
|
74d4f00784 | ||
|
d61c4235dc
|
|||
|
e8794b8c79
|
|||
|
552da005dc
|
|||
|
|
51452d2e68 | ||
|
5c5484b4d2
|
|||
|
|
684cb5a376 | ||
| 649d50812b | |||
| 2502989ca2 | |||
| ba7cc9168e | |||
| dc0d9fe038 | |||
| 0e6c6937cd | |||
| d839c53642 |
15
.github/dependabot.yml
vendored
Normal file
15
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
# To get started with Dependabot version updates, you'll need to specify which
|
||||
# package ecosystems to update and where the package manifests are located.
|
||||
# Please see the documentation for all configuration options:
|
||||
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "gomod"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "npm"
|
||||
directory: "/web/frontend"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
331
.github/workflows/Release.yml
vendored
331
.github/workflows/Release.yml
vendored
@@ -1,331 +0,0 @@
|
||||
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
|
||||
|
||||
# Workflow name
|
||||
name: Release
|
||||
|
||||
# Run on tag push
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '**'
|
||||
|
||||
jobs:
|
||||
|
||||
#
|
||||
# Build on AlmaLinux 8.5 using golang-1.18.2
|
||||
#
|
||||
AlmaLinux-RPM-build:
|
||||
runs-on: ubuntu-latest
|
||||
# See: https://hub.docker.com/_/almalinux
|
||||
container: almalinux:8.5
|
||||
# The job outputs link to the outputs of the 'rpmrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
rpm : ${{steps.rpmrename.outputs.RPM}}
|
||||
srpm : ${{steps.rpmrename.outputs.SRPM}}
|
||||
steps:
|
||||
|
||||
# Use dnf to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
|
||||
dnf --assumeyes install wget openssl-devel diffutils delve which npm
|
||||
dnf --assumeyes install 'dnf-command(builddep)'
|
||||
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
||||
rpm -i go*.rpm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
||||
|
||||
- name: RPM build ClusterCockpit
|
||||
id: rpmbuild
|
||||
run: make RPM
|
||||
|
||||
# AlmaLinux 8.5 is a derivate of RedHat Enterprise Linux 8 (UBI8),
|
||||
# so the created RPM both contain the substring 'el8' in the RPM file names
|
||||
# This step replaces the substring 'el8' to 'alma85'. It uses the move operation
|
||||
# because it is unclear whether the default AlmaLinux 8.5 container contains the
|
||||
# 'rename' command. This way we also get the new names for output.
|
||||
- name: Rename RPMs (s/el8/alma85/)
|
||||
id: rpmrename
|
||||
run: |
|
||||
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
|
||||
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
|
||||
NEW_RPM="${OLD_RPM/el8/alma85}"
|
||||
NEW_SRPM=${OLD_SRPM/el8/alma85}
|
||||
mv "${OLD_RPM}" "${NEW_RPM}"
|
||||
mv "${OLD_SRPM}" "${NEW_SRPM}"
|
||||
echo "::set-output name=SRPM::${NEW_SRPM}"
|
||||
echo "::set-output name=RPM::${NEW_RPM}"
|
||||
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save RPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for AlmaLinux 8.5
|
||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||
- name: Save SRPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for AlmaLinux 8.5
|
||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||
|
||||
#
|
||||
# Build on UBI 8 using golang-1.18.2
|
||||
#
|
||||
UBI-8-RPM-build:
|
||||
runs-on: ubuntu-latest
|
||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
||||
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
|
||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||
outputs:
|
||||
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
||||
srpm : ${{steps.rpmbuild.outputs.SRPM}}
|
||||
steps:
|
||||
|
||||
# Use dnf to install development packages
|
||||
- name: Install development packages
|
||||
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
|
||||
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
|
||||
rpm -i go*.rpm
|
||||
dnf --assumeyes --disableplugin=subscription-manager install npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
#dnf --assumeyes builddep build/package/cc-backend.spec
|
||||
|
||||
- name: RPM build ClusterCockpit
|
||||
id: rpmbuild
|
||||
run: make RPM
|
||||
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save RPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for UBI 8
|
||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||
- name: Save SRPM as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for UBI 8
|
||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||
|
||||
#
|
||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
||||
#
|
||||
Ubuntu-focal-build:
|
||||
runs-on: ubuntu-latest
|
||||
container: ubuntu:20.04
|
||||
# The job outputs link to the outputs of the 'debrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
deb : ${{steps.debrename.outputs.DEB}}
|
||||
steps:
|
||||
# Use apt to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash
|
||||
apt --assume-yes install npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
- name: DEB build ClusterCockpit
|
||||
id: dpkg-build
|
||||
run: |
|
||||
ls -la
|
||||
pwd
|
||||
env
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
git config --global --add safe.directory $(pwd)
|
||||
make DEB
|
||||
- name: Rename DEB (add '_ubuntu20.04')
|
||||
id: debrename
|
||||
run: |
|
||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
|
||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save DEB as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 20.04
|
||||
path: ${{ steps.debrename.outputs.DEB }}
|
||||
|
||||
#
|
||||
# Build on Ubuntu 20.04 using official go 1.19.1 package
|
||||
#
|
||||
Ubuntu-jammy-build:
|
||||
runs-on: ubuntu-latest
|
||||
container: ubuntu:22.04
|
||||
# The job outputs link to the outputs of the 'debrename' step
|
||||
# Only job outputs can be used in child jobs
|
||||
outputs:
|
||||
deb : ${{steps.debrename.outputs.DEB}}
|
||||
steps:
|
||||
# Use apt to install development packages
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash npm
|
||||
npm install --global yarn rollup svelte rollup-plugin-svelte
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
|
||||
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
- name: DEB build ClusterCockpit
|
||||
id: dpkg-build
|
||||
run: |
|
||||
ls -la
|
||||
pwd
|
||||
env
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
git config --global --add safe.directory $(pwd)
|
||||
make DEB
|
||||
- name: Rename DEB (add '_ubuntu22.04')
|
||||
id: debrename
|
||||
run: |
|
||||
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
|
||||
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
|
||||
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
|
||||
echo "::set-output name=DEB::${NEW_DEB_FILE}"
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
- name: Save DEB as artifact
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 22.04
|
||||
path: ${{ steps.debrename.outputs.DEB }}
|
||||
|
||||
#
|
||||
# Create release with fresh RPMs
|
||||
#
|
||||
Release:
|
||||
runs-on: ubuntu-latest
|
||||
# We need the RPMs, so add dependency
|
||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
|
||||
|
||||
steps:
|
||||
# See: https://github.com/actions/download-artifact
|
||||
- name: Download AlmaLinux 8.5 RPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for AlmaLinux 8.5
|
||||
- name: Download AlmaLinux 8.5 SRPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for AlmaLinux 8.5
|
||||
|
||||
- name: Download UBI 8 RPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend RPM for UBI 8
|
||||
- name: Download UBI 8 SRPM
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend SRPM for UBI 8
|
||||
|
||||
- name: Download Ubuntu 20.04 DEB
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 20.04
|
||||
|
||||
- name: Download Ubuntu 22.04 DEB
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-backend DEB for Ubuntu 22.04
|
||||
|
||||
# The download actions do not publish the name of the downloaded file,
|
||||
# so we re-use the job outputs of the parent jobs. The files are all
|
||||
# downloaded to the current folder.
|
||||
# The gh-release action afterwards does not accept file lists but all
|
||||
# files have to be listed at 'files'. The step creates one output per
|
||||
# RPM package (2 per distro)
|
||||
- name: Set RPM variables
|
||||
id: files
|
||||
run: |
|
||||
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
|
||||
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
|
||||
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
|
||||
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
|
||||
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
|
||||
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
|
||||
echo "ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "U_2004_DEB::${U_2004_DEB}"
|
||||
echo "U_2204_DEB::${U_2204_DEB}"
|
||||
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
|
||||
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
|
||||
|
||||
# See: https://github.com/softprops/action-gh-release
|
||||
- name: Release
|
||||
uses: softprops/action-gh-release@v1
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
with:
|
||||
name: cc-backend-${{github.ref_name}}
|
||||
files: |
|
||||
${{ steps.files.outputs.ALMA_85_RPM }}
|
||||
${{ steps.files.outputs.ALMA_85_SRPM }}
|
||||
${{ steps.files.outputs.UBI_8_RPM }}
|
||||
${{ steps.files.outputs.UBI_8_SRPM }}
|
||||
${{ steps.files.outputs.U_2004_DEB }}
|
||||
${{ steps.files.outputs.U_2204_DEB }}
|
||||
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.x
|
||||
go-version: 1.25.x
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build, Vet & Test
|
||||
|
||||
26
.gitignore
vendored
26
.gitignore
vendored
@@ -1,19 +1,29 @@
|
||||
/cc-backend
|
||||
|
||||
/var/job-archive
|
||||
/var/*.db
|
||||
/var/machine-state
|
||||
|
||||
/.env
|
||||
/config.json
|
||||
/uiConfig.json
|
||||
|
||||
/var/job-archive
|
||||
/var/machine-state
|
||||
/var/job.db-shm
|
||||
/var/job.db-wal
|
||||
/var/*.db
|
||||
/var/*.txt
|
||||
|
||||
/var/checkpoints*
|
||||
|
||||
migrateTimestamps.pl
|
||||
test_ccms_write_api.sh
|
||||
|
||||
/web/frontend/public/build
|
||||
/web/frontend/node_modules
|
||||
/.vscode/*
|
||||
|
||||
/archive-migration
|
||||
/archive-manager
|
||||
var/job.db-shm
|
||||
var/job.db-wal
|
||||
|
||||
/internal/repository/testdata/job.db-shm
|
||||
/internal/repository/testdata/job.db-wal
|
||||
|
||||
/.vscode/*
|
||||
dist/
|
||||
*.db
|
||||
|
||||
@@ -34,19 +34,6 @@ builds:
|
||||
main: ./tools/archive-manager
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
goarch:
|
||||
- amd64
|
||||
goamd64:
|
||||
- v3
|
||||
id: "archive-migration"
|
||||
binary: archive-migration
|
||||
main: ./tools/archive-migration
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
@@ -70,7 +57,7 @@ archives:
|
||||
{{- else }}{{ .Arch }}{{ end }}
|
||||
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
||||
checksum:
|
||||
name_template: 'checksums.txt'
|
||||
name_template: "checksums.txt"
|
||||
snapshot:
|
||||
name_template: "{{ incpatch .Version }}-next"
|
||||
changelog:
|
||||
@@ -100,7 +87,7 @@ changelog:
|
||||
release:
|
||||
draft: false
|
||||
footer: |
|
||||
Supports job archive version 1 and database version 6.
|
||||
Supports job archive version 2 and database version 8.
|
||||
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
||||
|
||||
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
||||
|
||||
35
Makefile
35
Makefile
@@ -2,7 +2,7 @@ TARGET = ./cc-backend
|
||||
VAR = ./var
|
||||
CFG = config.json .env
|
||||
FRONTEND = ./web/frontend
|
||||
VERSION = 1.3.1
|
||||
VERSION = 1.4.4
|
||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||
@@ -22,13 +22,23 @@ SVELTE_COMPONENTS = status \
|
||||
header
|
||||
|
||||
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/header/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/job/*.svelte)
|
||||
|
||||
.PHONY: clean distclean test tags frontend $(TARGET)
|
||||
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
@@ -40,6 +50,15 @@ frontend:
|
||||
$(info ===> BUILD frontend)
|
||||
cd web/frontend && npm install && npm run build
|
||||
|
||||
swagger:
|
||||
$(info ===> GENERATE swagger)
|
||||
@go run github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
|
||||
@mv ./api/docs.go ./internal/api/docs.go
|
||||
|
||||
graphql:
|
||||
$(info ===> GENERATE graphql)
|
||||
@go run github.com/99designs/gqlgen
|
||||
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@go clean
|
||||
@@ -63,7 +82,7 @@ tags:
|
||||
@ctags -R
|
||||
|
||||
$(VAR):
|
||||
@mkdir $(VAR)
|
||||
@mkdir -p $(VAR)
|
||||
|
||||
config.json:
|
||||
$(info ===> Initialize config.json file)
|
||||
|
||||
11
README.md
11
README.md
@@ -1,5 +1,8 @@
|
||||
# NOTE
|
||||
|
||||
While we do our best to keep the master branch in a usable state, there is no guarantee the master branch works.
|
||||
Please do not use it for production!
|
||||
|
||||
Please have a look at the [Release
|
||||
Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md)
|
||||
for breaking changes!
|
||||
@@ -65,7 +68,7 @@ cd ./cc-backend
|
||||
./startDemo.sh
|
||||
```
|
||||
|
||||
You can also try the demo using the lates release binary.
|
||||
You can also try the demo using the latest release binary.
|
||||
Create a folder and put the release binary `cc-backend` into this folder.
|
||||
Execute the following steps:
|
||||
|
||||
@@ -88,7 +91,9 @@ Analysis, Systems and Status views).
|
||||
There is a Makefile to automate the build of cc-backend. The Makefile supports
|
||||
the following targets:
|
||||
|
||||
* `make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequesite handling. Any change of frontend source files will result in a complete rebuild.
|
||||
* `make`: Initialize `var` directory and build svelte frontend and backend
|
||||
binary. Note that there is no proper prerequisite handling. Any change of
|
||||
frontend source files will result in a complete rebuild.
|
||||
* `make clean`: Clean go build cache and remove binary.
|
||||
* `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||
|
||||
@@ -147,8 +152,6 @@ contains Go packages that can be used by other projects.
|
||||
Additional command line helper tools.
|
||||
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
||||
Commands for getting infos about and existing job archive.
|
||||
* [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
|
||||
Tool to migrate from previous to current job archive version.
|
||||
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
||||
Tool to convert external pubkey for use in `cc-backend`.
|
||||
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
||||
|
||||
@@ -1,11 +1,47 @@
|
||||
# `cc-backend` version 1.3.1
|
||||
# `cc-backend` version 1.4.4
|
||||
|
||||
Supports job archive version 1 and database version 7.
|
||||
Supports job archive version 2 and database version 8.
|
||||
|
||||
This is a bugfix release of `cc-backend`, the API backend and frontend
|
||||
This is a bug fix release of `cc-backend`, the API backend and frontend
|
||||
implementation of ClusterCockpit.
|
||||
For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).
|
||||
|
||||
## Breaking changes
|
||||
|
||||
None
|
||||
The option `apiAllowedIPs` is now a required configuration attribute in
|
||||
`config.json`. This option restricts access to the admin API.
|
||||
|
||||
To retain the previous behavior that the API is per default accessible from
|
||||
everywhere set:
|
||||
|
||||
```json
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]
|
||||
```
|
||||
|
||||
## Breaking changes for minor release 1.4.x
|
||||
|
||||
- You need to perform a database migration. Depending on your database size the
|
||||
migration might require several hours!
|
||||
- You need to adapt the `cluster.json` configuration files in the job-archive,
|
||||
add new required attributes to the metric list and after that edit
|
||||
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
|
||||
attribute set can be filtered and show up in the footprint UI and polar plot.
|
||||
- Continuous scrolling is default now in all job lists. You can change this back
|
||||
to paging globally, also every user can configure to use paging or continuous
|
||||
scrolling individually.
|
||||
- Tags have a scope now. Existing tags will get global scope in the database
|
||||
migration.
|
||||
|
||||
## New features
|
||||
|
||||
- Enable to delete tags from the web interface
|
||||
|
||||
## Known issues
|
||||
|
||||
- Currently energy footprint metrics of type energy are ignored for calculating
|
||||
total energy.
|
||||
- Resampling for running jobs only works with cc-metric-store
|
||||
- With energy footprint metrics of type power the unit is ignored and it is
|
||||
assumed the metric has the unit Watt.
|
||||
|
||||
@@ -4,138 +4,216 @@ scalar Any
|
||||
scalar NullableFloat
|
||||
scalar MetricScope
|
||||
scalar JobState
|
||||
scalar SchedulerState
|
||||
scalar MonitoringState
|
||||
|
||||
type Node {
|
||||
id: ID!
|
||||
hostname: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
jobsRunning: Int!
|
||||
cpusAllocated: Int
|
||||
memoryAllocated: Int
|
||||
gpusAllocated: Int
|
||||
schedulerState: SchedulerState!
|
||||
healthState: MonitoringState!
|
||||
metaData: Any
|
||||
}
|
||||
|
||||
type NodeStates {
|
||||
state: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type NodeStatesTimed {
|
||||
state: String!
|
||||
type: String!
|
||||
count: Int!
|
||||
time: Int!
|
||||
}
|
||||
|
||||
type Job {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
SMT: Int!
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
energy: Float!
|
||||
SMT: Int!
|
||||
shared: String!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
monitoringStatus: Int!
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
|
||||
memUsedMax: Float
|
||||
flopsAnyAvg: Float
|
||||
memBwAvg: Float
|
||||
loadAvg: Float
|
||||
|
||||
metaData: Any
|
||||
userData: User
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
footprint: [FootprintValue]
|
||||
energyFootprint: [EnergyFootprintValue]
|
||||
metaData: Any
|
||||
userData: User
|
||||
}
|
||||
|
||||
type JobLink {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
}
|
||||
|
||||
type Cluster {
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
metricConfig: [MetricConfig!]!
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
}
|
||||
|
||||
type SubCluster {
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: MetricValue!
|
||||
flopRateSimd: MetricValue!
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: MetricValue!
|
||||
flopRateSimd: MetricValue!
|
||||
memoryBandwidth: MetricValue!
|
||||
topology: Topology!
|
||||
topology: Topology!
|
||||
metricConfig: [MetricConfig!]!
|
||||
footprint: [String!]!
|
||||
}
|
||||
|
||||
type FootprintValue {
|
||||
name: String!
|
||||
stat: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type EnergyFootprintValue {
|
||||
hardware: String!
|
||||
metric: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type MetricValue {
|
||||
name: String
|
||||
unit: Unit!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type Topology {
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
memoryDomain: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
id: String!
|
||||
type: String!
|
||||
id: String!
|
||||
type: String!
|
||||
model: String!
|
||||
}
|
||||
|
||||
type SubClusterConfig {
|
||||
name: String!
|
||||
peak: Float
|
||||
normal: Float
|
||||
name: String!
|
||||
peak: Float
|
||||
normal: Float
|
||||
caution: Float
|
||||
alert: Float
|
||||
remove: Boolean
|
||||
alert: Float
|
||||
remove: Boolean
|
||||
}
|
||||
|
||||
type MetricConfig {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
aggregation: String!
|
||||
timestep: Int!
|
||||
peak: Float!
|
||||
normal: Float
|
||||
timestep: Int!
|
||||
peak: Float!
|
||||
normal: Float
|
||||
caution: Float!
|
||||
alert: Float!
|
||||
alert: Float!
|
||||
lowerIsBetter: Boolean
|
||||
subClusters: [SubClusterConfig!]!
|
||||
}
|
||||
|
||||
type Tag {
|
||||
id: ID!
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
scope: String!
|
||||
}
|
||||
|
||||
type Resource {
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
configuration: String
|
||||
}
|
||||
|
||||
type JobMetricWithName {
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
metric: JobMetric!
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
statisticsSeries: StatsSeries
|
||||
}
|
||||
|
||||
type Series {
|
||||
hostname: String!
|
||||
id: String
|
||||
hostname: String!
|
||||
id: String
|
||||
statistics: MetricStatistics
|
||||
data: [NullableFloat!]!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
median: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type NamedStatsWithScope {
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
stats: [ScopedStats!]!
|
||||
}
|
||||
|
||||
type ScopedStats {
|
||||
hostname: String!
|
||||
id: String
|
||||
data: MetricStatistics!
|
||||
}
|
||||
|
||||
type JobStats {
|
||||
id: Int!
|
||||
jobId: String!
|
||||
startTime: Int!
|
||||
duration: Int!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int
|
||||
numAccelerators: Int
|
||||
stats: [NamedStats!]!
|
||||
}
|
||||
|
||||
type NamedStats {
|
||||
name: String!
|
||||
data: MetricStatistics!
|
||||
}
|
||||
|
||||
type Unit {
|
||||
@@ -149,20 +227,14 @@ type MetricStatistics {
|
||||
max: Float!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
metric: String!
|
||||
data: [NullableFloat!]!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type Footprints {
|
||||
timeWeights: TimeWeights!
|
||||
metrics: [MetricFootprints!]!
|
||||
metrics: [MetricFootprints!]!
|
||||
}
|
||||
|
||||
type TimeWeights {
|
||||
@@ -171,87 +243,207 @@ type TimeWeights {
|
||||
coreHours: [NullableFloat!]!
|
||||
}
|
||||
|
||||
enum Aggregate { USER, PROJECT, CLUSTER }
|
||||
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS }
|
||||
enum Aggregate {
|
||||
USER
|
||||
PROJECT
|
||||
CLUSTER
|
||||
SUBCLUSTER
|
||||
}
|
||||
enum SortByAggregate {
|
||||
TOTALWALLTIME
|
||||
TOTALJOBS
|
||||
TOTALUSERS
|
||||
TOTALNODES
|
||||
TOTALNODEHOURS
|
||||
TOTALCORES
|
||||
TOTALCOREHOURS
|
||||
TOTALACCS
|
||||
TOTALACCHOURS
|
||||
}
|
||||
|
||||
type NodeMetrics {
|
||||
host: String!
|
||||
host: String!
|
||||
subCluster: String!
|
||||
metrics: [JobMetricWithName!]!
|
||||
metrics: [JobMetricWithName!]!
|
||||
}
|
||||
|
||||
type NodesResultList {
|
||||
items: [NodeMetrics!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
totalNodes: Int
|
||||
hasNextPage: Boolean
|
||||
}
|
||||
|
||||
type ClusterSupport {
|
||||
cluster: String!
|
||||
subClusters: [String!]!
|
||||
}
|
||||
|
||||
type GlobalMetricListItem {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
footprint: String
|
||||
availability: [ClusterSupport!]!
|
||||
}
|
||||
|
||||
type Count {
|
||||
name: String!
|
||||
name: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type User {
|
||||
username: String!
|
||||
name: String!
|
||||
email: String!
|
||||
name: String!
|
||||
email: String!
|
||||
}
|
||||
|
||||
input MetricStatItem {
|
||||
metricName: String!
|
||||
range: FloatRange!
|
||||
}
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
globalMetrics: [GlobalMetricListItem!]!
|
||||
|
||||
user(username: String!): User
|
||||
allocatedNodes(cluster: String!): [Count!]!
|
||||
|
||||
## Node Queries New
|
||||
node(id: ID!): Node
|
||||
nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
|
||||
nodeStates(filter: [NodeFilter!]): [NodeStates!]!
|
||||
nodeStatesTimed(filter: [NodeFilter!]): [NodeStatesTimed!]!
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
||||
jobMetrics(
|
||||
id: ID!
|
||||
metrics: [String!]
|
||||
scopes: [MetricScope!]
|
||||
resolution: Int
|
||||
): [JobMetricWithName!]!
|
||||
|
||||
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
|
||||
|
||||
scopedJobStats(
|
||||
id: ID!
|
||||
metrics: [String!]
|
||||
scopes: [MetricScope!]
|
||||
): [NamedStatsWithScope!]!
|
||||
|
||||
jobs(
|
||||
filter: [JobFilter!]
|
||||
page: PageRequest
|
||||
order: OrderByInput
|
||||
): JobResultList!
|
||||
|
||||
jobsStatistics(
|
||||
filter: [JobFilter!]
|
||||
metrics: [String!]
|
||||
page: PageRequest
|
||||
sortBy: SortByAggregate
|
||||
groupBy: Aggregate
|
||||
numDurationBins: String
|
||||
numMetricBins: Int
|
||||
): [JobsStatistics!]!
|
||||
|
||||
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
|
||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||
|
||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
||||
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
|
||||
rooflineHeatmap(
|
||||
filter: [JobFilter!]!
|
||||
rows: Int!
|
||||
cols: Int!
|
||||
minX: Float!
|
||||
minY: Float!
|
||||
maxX: Float!
|
||||
maxY: Float!
|
||||
): [[Float!]!]!
|
||||
|
||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
||||
nodeMetrics(
|
||||
cluster: String!
|
||||
nodes: [String!]
|
||||
scopes: [MetricScope!]
|
||||
metrics: [String!]
|
||||
from: Time!
|
||||
to: Time!
|
||||
): [NodeMetrics!]!
|
||||
|
||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
||||
nodeMetricsList(
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
nodeFilter: String!
|
||||
scopes: [MetricScope!]
|
||||
metrics: [String!]
|
||||
from: Time!
|
||||
to: Time!
|
||||
page: PageRequest
|
||||
resolution: Int
|
||||
): NodesResultList!
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
createTag(type: String!, name: String!, scope: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagFromList(tagIds: [ID!]!): [Int!]!
|
||||
|
||||
updateConfiguration(name: String!, value: String!): String
|
||||
}
|
||||
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
type IntRangeOutput {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
type TimeRangeOutput {
|
||||
range: String
|
||||
from: Time!
|
||||
to: Time!
|
||||
}
|
||||
|
||||
input NodeFilter {
|
||||
hostname: StringInput
|
||||
cluster: StringInput
|
||||
subcluster: StringInput
|
||||
schedulerState: SchedulerState
|
||||
healthState: MonitoringState
|
||||
timeStart: Int
|
||||
}
|
||||
|
||||
input JobFilter {
|
||||
tags: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
jobName: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
tags: [ID!]
|
||||
dbId: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
jobName: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
energy: FloatRange
|
||||
|
||||
minRunningFor: Int
|
||||
|
||||
numNodes: IntRange
|
||||
numNodes: IntRange
|
||||
numAccelerators: IntRange
|
||||
numHWThreads: IntRange
|
||||
numHWThreads: IntRange
|
||||
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
memUsedMax: FloatRange
|
||||
|
||||
exclusive: Int
|
||||
node: StringInput
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
metricStats: [MetricStatItem!]
|
||||
shared: String
|
||||
node: StringInput
|
||||
}
|
||||
|
||||
input OrderByInput {
|
||||
field: String!
|
||||
type: String!
|
||||
order: SortDirectionEnum! = ASC
|
||||
}
|
||||
|
||||
@@ -261,30 +453,46 @@ enum SortDirectionEnum {
|
||||
}
|
||||
|
||||
input StringInput {
|
||||
eq: String
|
||||
neq: String
|
||||
contains: String
|
||||
eq: String
|
||||
neq: String
|
||||
contains: String
|
||||
startsWith: String
|
||||
endsWith: String
|
||||
in: [String!]
|
||||
endsWith: String
|
||||
in: [String!]
|
||||
}
|
||||
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
input IntRange {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
input TimeRange {
|
||||
range: String
|
||||
from: Time
|
||||
to: Time
|
||||
}
|
||||
|
||||
input FloatRange {
|
||||
from: Float!
|
||||
to: Float!
|
||||
}
|
||||
|
||||
type NodeStateResultList {
|
||||
items: [Node!]!
|
||||
count: Int
|
||||
}
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
items: [Job!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
hasNextPage: Boolean
|
||||
}
|
||||
|
||||
type JobLinkResultList {
|
||||
listQuery: String
|
||||
items: [JobLink!]!
|
||||
count: Int
|
||||
items: [JobLink!]!
|
||||
count: Int
|
||||
}
|
||||
|
||||
type HistoPoint {
|
||||
@@ -295,6 +503,7 @@ type HistoPoint {
|
||||
type MetricHistoPoints {
|
||||
metric: String!
|
||||
unit: String!
|
||||
stat: String
|
||||
data: [MetricHistoPoint!]
|
||||
}
|
||||
|
||||
@@ -305,27 +514,28 @@ type MetricHistoPoint {
|
||||
max: Int
|
||||
}
|
||||
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster
|
||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||
totalJobs: Int! # Number of jobs
|
||||
runningJobs: Int! # Number of running jobs
|
||||
shortJobs: Int! # Number of jobs with a duration of less than duration
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalNodes: Int! # Sum of the nodes of all matched jobs
|
||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||
totalCores: Int! # Sum of the cores of all matched jobs
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
totalAccs: Int! # Sum of the accs of all matched jobs
|
||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
||||
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
||||
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster
|
||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||
totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs)
|
||||
totalJobs: Int! # Number of jobs
|
||||
runningJobs: Int! # Number of running jobs
|
||||
shortJobs: Int! # Number of jobs with a duration of less than config'd ShortRunningJobsDuration
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalNodes: Int! # Sum of the nodes of all matched jobs
|
||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||
totalCores: Int! # Sum of the cores of all matched jobs
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
totalAccs: Int! # Sum of the accs of all matched jobs
|
||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
|
||||
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
|
||||
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
|
||||
}
|
||||
|
||||
input PageRequest {
|
||||
itemsPerPage: Int!
|
||||
page: Int!
|
||||
page: Int!
|
||||
}
|
||||
|
||||
844
api/swagger.json
844
api/swagger.json
File diff suppressed because it is too large
Load Diff
663
api/swagger.yaml
663
api/swagger.yaml
File diff suppressed because it is too large
Load Diff
35
cmd/cc-backend/cli.go
Normal file
35
cmd/cc-backend/cli.go
Normal file
@@ -0,0 +1,35 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import "flag"
|
||||
|
||||
var (
|
||||
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
|
||||
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool
|
||||
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||
)
|
||||
|
||||
func cliInit() {
|
||||
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
|
||||
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
|
||||
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
|
||||
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
|
||||
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
|
||||
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
|
||||
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
|
||||
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
|
||||
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
|
||||
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
|
||||
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
|
||||
flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>")
|
||||
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
|
||||
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
|
||||
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info (default), warn, err, crit]`")
|
||||
flag.Parse()
|
||||
}
|
||||
95
cmd/cc-backend/init.go
Normal file
95
cmd/cc-backend/init.go
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
const envString = `
|
||||
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||
# You can generate your own keypair using the gen-keypair tool
|
||||
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||
|
||||
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||
`
|
||||
|
||||
const configString = `
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
],
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "name",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2023-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
|
||||
func initEnv() {
|
||||
if util.CheckFileExists("var") {
|
||||
cclog.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
|
||||
}
|
||||
|
||||
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||
cclog.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||
cclog.Abortf("Could not write default ./.env file with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err := os.Mkdir("var", 0o777); err != nil {
|
||||
cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
err := repository.MigrateDB("sqlite3", "./var/job.db")
|
||||
if err != nil {
|
||||
cclog.Abortf("Could not initialize default sqlite3 database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
@@ -1,162 +1,59 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/tagger"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/taskManager"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
"github.com/go-co-op/gocron"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
"github.com/google/gops/agent"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/mux"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
"github.com/joho/godotenv"
|
||||
|
||||
_ "github.com/go-sql-driver/mysql"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
const logoString = `
|
||||
____ _ _ ____ _ _ _
|
||||
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
|
||||
_____ _ _ ____ _ _ _
|
||||
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
|
||||
| | | | | | / __| __/ _ \ '__| | / _ \ / __| |/ / '_ \| | __|
|
||||
| |___| | |_| \__ \ || __/ | | |__| (_) | (__| <| |_) | | |_
|
||||
\____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
|
||||
\_____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
|
||||
|_|
|
||||
`
|
||||
|
||||
const envString = `
|
||||
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||
# You can generate your own keypair using the gen-keypair tool
|
||||
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||
|
||||
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||
`
|
||||
|
||||
const configString = `
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "name",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2023-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
|
||||
var (
|
||||
date string
|
||||
commit string
|
||||
version string
|
||||
)
|
||||
|
||||
func initEnv() {
|
||||
if util.CheckFileExists("var") {
|
||||
fmt.Print("Directory ./var already exists. Exiting!\n")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||
log.Fatalf("Writing config.json failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||
log.Fatalf("Writing .env failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.Mkdir("var", 0o777); err != nil {
|
||||
log.Fatalf("Mkdir var failed: %s", err.Error())
|
||||
}
|
||||
|
||||
err := repository.MigrateDB("sqlite3", "./var/job.db")
|
||||
if err != nil {
|
||||
log.Fatalf("Initialize job.db failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
var flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
|
||||
var flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize swlite database file, config.json and .env")
|
||||
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
|
||||
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'user' table with ldap")
|
||||
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
|
||||
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
|
||||
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
|
||||
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
|
||||
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
|
||||
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
|
||||
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin,support,manager,api,user]:<password>`")
|
||||
flag.StringVar(&flagDelUser, "del-user", "", "Remove user by `username`")
|
||||
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
|
||||
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
|
||||
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`")
|
||||
flag.Parse()
|
||||
cliInit()
|
||||
|
||||
if flagVersion {
|
||||
fmt.Print(logoString)
|
||||
@@ -168,378 +65,208 @@ func main() {
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
// Apply config flags for pkg/log
|
||||
log.Init(flagLogLevel, flagLogDateTime)
|
||||
cclog.Init(flagLogLevel, flagLogDateTime)
|
||||
|
||||
// If init flag set, run tasks here before any file dependencies cause errors
|
||||
if flagInit {
|
||||
initEnv()
|
||||
fmt.Print("Succesfully setup environment!\n")
|
||||
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
|
||||
fmt.Print("Add your job-archive at ./var/job-archive.\n")
|
||||
os.Exit(0)
|
||||
cclog.Exit("Successfully setup environment!\n" +
|
||||
"Please review config.json and .env and adjust it to your needs.\n" +
|
||||
"Add your job-archive at ./var/job-archive.")
|
||||
}
|
||||
|
||||
// See https://github.com/google/gops (Runtime overhead is almost zero)
|
||||
if flagGops {
|
||||
if err := agent.Listen(agent.Options{}); err != nil {
|
||||
log.Fatalf("gops/agent.Listen failed: %s", err.Error())
|
||||
cclog.Abortf("Could not start gops agent with 'gops/agent.Listen(agent.Options{})'. Application startup failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
if err := runtimeEnv.LoadEnv("./.env"); err != nil && !os.IsNotExist(err) {
|
||||
log.Fatalf("parsing './.env' file failed: %s", err.Error())
|
||||
err := godotenv.Load()
|
||||
if err != nil {
|
||||
cclog.Abortf("Could not parse existing .env file at location './.env'. Application startup failed, exited.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
// Initialize sub-modules and handle command line flags.
|
||||
// The order here is important!
|
||||
config.Init(flagConfigFile)
|
||||
ccconf.Init(flagConfigFile)
|
||||
|
||||
// As a special case for `db`, allow using an environment variable instead of the value
|
||||
// stored in the config. This can be done for people having security concerns about storing
|
||||
// the password for their mysql database in config.json.
|
||||
if strings.HasPrefix(config.Keys.DB, "env:") {
|
||||
envvar := strings.TrimPrefix(config.Keys.DB, "env:")
|
||||
config.Keys.DB = os.Getenv(envvar)
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
if flagMigrateDB {
|
||||
err := repository.MigrateDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Abortf("MigrateDB Failed: Could not migrate '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, repository.Version, err.Error())
|
||||
}
|
||||
os.Exit(0)
|
||||
cclog.Exitf("MigrateDB Success: Migrated '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, repository.Version)
|
||||
}
|
||||
|
||||
if flagRevertDB {
|
||||
err := repository.RevertDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Abortf("RevertDB Failed: Could not revert '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, (repository.Version - 1), err.Error())
|
||||
}
|
||||
os.Exit(0)
|
||||
cclog.Exitf("RevertDB Success: Reverted '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, (repository.Version - 1))
|
||||
}
|
||||
|
||||
if flagForceDB {
|
||||
err := repository.ForceDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Abortf("ForceDB Failed: Could not force '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, repository.Version, err.Error())
|
||||
}
|
||||
os.Exit(0)
|
||||
cclog.Exitf("ForceDB Success: Forced '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, repository.Version)
|
||||
}
|
||||
|
||||
repository.Connect(config.Keys.DBDriver, config.Keys.DB)
|
||||
db := repository.GetConnection()
|
||||
|
||||
var authentication *auth.Authentication
|
||||
if !config.Keys.DisableAuthentication {
|
||||
var err error
|
||||
if authentication, err = auth.Init(); err != nil {
|
||||
log.Fatalf("auth initialization failed: %v", err)
|
||||
}
|
||||
|
||||
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err != nil {
|
||||
authentication.SessionMaxAge = d
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
if flagNewUser != "" {
|
||||
parts := strings.SplitN(flagNewUser, ":", 3)
|
||||
if len(parts) != 3 || len(parts[0]) == 0 {
|
||||
log.Fatal("invalid argument format for user creation")
|
||||
cclog.Abortf("Add User: Could not parse supplied argument format: No changes.\n"+
|
||||
"Want: <username>:[admin,support,manager,api,user]:<password>\n"+
|
||||
"Have: %s\n", flagNewUser)
|
||||
}
|
||||
|
||||
ur := repository.GetUserRepository()
|
||||
if err := ur.AddUser(&schema.User{
|
||||
Username: parts[0], Projects: make([]string, 0), Password: parts[2], Roles: strings.Split(parts[1], ","),
|
||||
}); err != nil {
|
||||
log.Fatalf("adding '%s' user authentication failed: %v", parts[0], err)
|
||||
cclog.Abortf("Add User: Could not add new user authentication for '%s' and roles '%s'.\nError: %s\n", parts[0], parts[1], err.Error())
|
||||
} else {
|
||||
cclog.Printf("Add User: Added new user '%s' with roles '%s'.\n", parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
|
||||
if flagDelUser != "" {
|
||||
ur := repository.GetUserRepository()
|
||||
if err := ur.DelUser(flagDelUser); err != nil {
|
||||
log.Fatalf("deleting user failed: %v", err)
|
||||
cclog.Abortf("Delete User: Could not delete user '%s' from DB.\nError: %s\n", flagDelUser, err.Error())
|
||||
} else {
|
||||
cclog.Printf("Delete User: Deleted user '%s' from DB.\n", flagDelUser)
|
||||
}
|
||||
}
|
||||
|
||||
authHandle := auth.GetAuthInstance()
|
||||
|
||||
if flagSyncLDAP {
|
||||
if authentication.LdapAuth == nil {
|
||||
log.Fatal("cannot sync: LDAP authentication is not configured")
|
||||
if authHandle.LdapAuth == nil {
|
||||
cclog.Abort("Sync LDAP: LDAP authentication is not configured, could not synchronize. No changes, exited.")
|
||||
}
|
||||
|
||||
if err := authentication.LdapAuth.Sync(); err != nil {
|
||||
log.Fatalf("LDAP sync failed: %v", err)
|
||||
if err := authHandle.LdapAuth.Sync(); err != nil {
|
||||
cclog.Abortf("Sync LDAP: Could not synchronize, failed with error.\nError: %s\n", err.Error())
|
||||
}
|
||||
log.Info("LDAP sync successfull")
|
||||
cclog.Print("Sync LDAP: LDAP synchronization successfull.")
|
||||
}
|
||||
|
||||
if flagGenJWT != "" {
|
||||
ur := repository.GetUserRepository()
|
||||
user, err := ur.GetUser(flagGenJWT)
|
||||
if err != nil {
|
||||
log.Fatalf("could not get user from JWT: %v", err)
|
||||
cclog.Abortf("JWT: Could not get supplied user '%s' from DB. No changes, exited.\nError: %s\n", flagGenJWT, err.Error())
|
||||
}
|
||||
|
||||
if !user.HasRole(schema.RoleApi) {
|
||||
log.Warnf("user '%s' does not have the API role", user.Username)
|
||||
cclog.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username)
|
||||
}
|
||||
|
||||
jwt, err := authentication.JwtAuth.ProvideJWT(user)
|
||||
jwt, err := authHandle.JwtAuth.ProvideJWT(user)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to provide JWT to user '%s': %v", user.Username, err)
|
||||
cclog.Abortf("JWT: User '%s' found in DB, but failed to provide JWT.\nError: %s\n", user.Username, err.Error())
|
||||
}
|
||||
|
||||
fmt.Printf("MAIN > JWT for '%s': %s\n", user.Username, jwt)
|
||||
cclog.Printf("JWT: Successfully generated JWT for user '%s': %s\n", user.Username, jwt)
|
||||
}
|
||||
|
||||
} else if flagNewUser != "" || flagDelUser != "" {
|
||||
log.Fatal("arguments --add-user and --del-user can only be used if authentication is enabled")
|
||||
cclog.Abort("Error: Arguments '--add-user' and '--del-user' can only be used if authentication is enabled. No changes, exited.")
|
||||
}
|
||||
|
||||
if err := archive.Init(config.Keys.Archive, config.Keys.DisableArchive); err != nil {
|
||||
log.Fatalf("failed to initialize archive: %s", err.Error())
|
||||
if archiveCfg := ccconf.GetPackageConfig("archive"); archiveCfg != nil {
|
||||
err = archive.Init(archiveCfg, config.Keys.DisableArchive)
|
||||
} else {
|
||||
err = archive.Init(json.RawMessage("{\"kind\":\"file\",\"path\":\"./var/job-archive\"}"), config.Keys.DisableArchive)
|
||||
}
|
||||
if err != nil {
|
||||
cclog.Abortf("Init: Failed to initialize archive.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
||||
log.Fatalf("failed to initialize metricdata repository: %s", err.Error())
|
||||
if err := metricdata.Init(); err != nil {
|
||||
cclog.Abortf("Init: Failed to initialize metricdata repository.\nError %s\n", err.Error())
|
||||
}
|
||||
|
||||
if flagReinitDB {
|
||||
if err := importer.InitDB(); err != nil {
|
||||
log.Fatalf("failed to re-initialize repository DB: %s", err.Error())
|
||||
cclog.Abortf("Init DB: Failed to re-initialize repository DB.\nError: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Print("Init DB: Sucessfully re-initialized repository DB.")
|
||||
}
|
||||
}
|
||||
|
||||
if flagImportJob != "" {
|
||||
if err := importer.HandleImportFlag(flagImportJob); err != nil {
|
||||
log.Fatalf("job import failed: %s", err.Error())
|
||||
cclog.Abortf("Import Job: Job import failed.\nError: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Printf("Import Job: Imported Job '%s' into DB.\n", flagImportJob)
|
||||
}
|
||||
}
|
||||
|
||||
if config.Keys.EnableJobTaggers {
|
||||
tagger.Init()
|
||||
}
|
||||
|
||||
if flagApplyTags {
|
||||
if err := tagger.RunTaggers(); err != nil {
|
||||
cclog.Abortf("Running job taggers.\nError: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
if !flagServer {
|
||||
return
|
||||
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
|
||||
}
|
||||
|
||||
// Setup the http.Handler/Router used by the server
|
||||
jobRepo := repository.GetJobRepository()
|
||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
||||
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
if os.Getenv("DEBUG") != "1" {
|
||||
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
|
||||
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||
graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
|
||||
switch e := err.(type) {
|
||||
case string:
|
||||
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||
case error:
|
||||
return fmt.Errorf("MAIN > Panic caused by: %w", e)
|
||||
}
|
||||
|
||||
return errors.New("MAIN > Internal server error (panic)")
|
||||
})
|
||||
}
|
||||
|
||||
api := &api.RestApi{
|
||||
JobRepository: jobRepo,
|
||||
Resolver: resolver,
|
||||
MachineStateDir: config.Keys.MachineStateDir,
|
||||
Authentication: authentication,
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
|
||||
|
||||
info := map[string]interface{}{}
|
||||
info["hasOpenIDConnect"] = false
|
||||
|
||||
if config.Keys.OpenIDConfig != nil {
|
||||
openIDConnect := auth.NewOIDC(authentication)
|
||||
openIDConnect.RegisterEndpoints(r)
|
||||
info["hasOpenIDConnect"] = true
|
||||
}
|
||||
|
||||
r.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
log.Debugf("##%v##", info)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||
}).Methods(http.MethodGet)
|
||||
r.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||
})
|
||||
r.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||
})
|
||||
|
||||
secured := r.PathPrefix("/").Subrouter()
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
r.Handle("/login", authentication.Login(
|
||||
// On success:
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect),
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})).Methods(http.MethodPost)
|
||||
|
||||
r.Handle("/jwt-login", authentication.Login(
|
||||
// On success:
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect),
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))
|
||||
|
||||
r.Handle("/logout", authentication.Logout(
|
||||
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Bye - ClusterCockpit",
|
||||
MsgType: "alert-info",
|
||||
Message: "Logout successful",
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))).Methods(http.MethodPost)
|
||||
|
||||
secured.Use(func(next http.Handler) http.Handler {
|
||||
return authentication.Auth(
|
||||
// On success;
|
||||
next,
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Authentication failed - ClusterCockpit",
|
||||
MsgType: "alert-danger",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
if flagDev {
|
||||
r.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||
r.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
|
||||
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
|
||||
}
|
||||
secured.Handle("/query", graphQLEndpoint)
|
||||
|
||||
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
|
||||
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||
})
|
||||
|
||||
// Mount all /monitoring/... and /api/... routes.
|
||||
routerConfig.SetupRoutes(secured, buildInfo)
|
||||
api.MountRoutes(secured)
|
||||
|
||||
if config.Keys.EmbedStaticFiles {
|
||||
if i, err := os.Stat("./var/img"); err == nil {
|
||||
if i.IsDir() {
|
||||
log.Info("Use local directory for static images")
|
||||
r.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||
}
|
||||
}
|
||||
r.PathPrefix("/").Handler(web.ServeFiles())
|
||||
} else {
|
||||
r.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
|
||||
}
|
||||
|
||||
r.Use(handlers.CompressHandler)
|
||||
r.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
|
||||
r.Use(handlers.CORS(
|
||||
handlers.AllowCredentials(),
|
||||
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
|
||||
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
|
||||
handlers.AllowedOrigins([]string{"*"})))
|
||||
handler := handlers.CustomLoggingHandler(io.Discard, r, func(_ io.Writer, params handlers.LogFormatterParams) {
|
||||
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
} else {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
}
|
||||
})
|
||||
|
||||
var wg sync.WaitGroup
|
||||
server := http.Server{
|
||||
ReadTimeout: 10 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
Handler: handler,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||
if err != nil {
|
||||
log.Fatalf("starting http listener failed: %v", err)
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
|
||||
}()
|
||||
}
|
||||
|
||||
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
|
||||
if err != nil {
|
||||
log.Fatalf("loading X509 keypair failed: %v", err)
|
||||
// Metric Store starts after all flags have been processes
|
||||
if memorystore.InternalCCMSFlag {
|
||||
if mscfg := ccconf.GetPackageConfig("metric-store"); mscfg != nil {
|
||||
memorystore.Init(mscfg, &wg)
|
||||
} else {
|
||||
cclog.Abort("Metric Store configuration must be present")
|
||||
}
|
||||
listener = tls.NewListener(listener, &tls.Config{
|
||||
Certificates: []tls.Certificate{cert},
|
||||
CipherSuites: []uint16{
|
||||
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
},
|
||||
MinVersion: tls.VersionTLS12,
|
||||
PreferServerCipherSuites: true,
|
||||
})
|
||||
fmt.Printf("HTTPS server listening at %s...", config.Keys.Addr)
|
||||
} else {
|
||||
fmt.Printf("HTTP server listening at %s...", config.Keys.Addr)
|
||||
}
|
||||
archiver.Start(repository.GetJobRepository())
|
||||
|
||||
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||
// be established first, then the user can be changed, and after that,
|
||||
// the actual http server can be started.
|
||||
if err = runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||
log.Fatalf("error while preparing server start: %s", err.Error())
|
||||
}
|
||||
taskManager.Start(ccconf.GetPackageConfig("cron"),
|
||||
ccconf.GetPackageConfig("archive"))
|
||||
|
||||
cfg := ccconf.GetPackageConfig("ui")
|
||||
web.Init(cfg)
|
||||
|
||||
serverInit()
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("starting server failed: %v", err)
|
||||
}
|
||||
serverStart()
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
@@ -550,117 +277,17 @@ func main() {
|
||||
<-sigs
|
||||
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
|
||||
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
server.Shutdown(context.Background())
|
||||
serverShutdown()
|
||||
|
||||
// Then, wait for any async archivings still pending...
|
||||
api.JobRepository.WaitForArchiving()
|
||||
util.FsWatcherShutdown()
|
||||
|
||||
taskManager.Shutdown()
|
||||
}()
|
||||
|
||||
s := gocron.NewScheduler(time.Local)
|
||||
|
||||
if config.Keys.StopJobsExceedingWalltime > 0 {
|
||||
log.Info("Register undead jobs service")
|
||||
|
||||
s.Every(1).Day().At("3:00").Do(func() {
|
||||
err = jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
|
||||
}
|
||||
runtime.GC()
|
||||
})
|
||||
}
|
||||
|
||||
var cfg struct {
|
||||
Retention schema.Retention `json:"retention"`
|
||||
Compression int `json:"compression"`
|
||||
}
|
||||
|
||||
cfg.Retention.IncludeDB = true
|
||||
|
||||
if err = json.Unmarshal(config.Keys.Archive, &cfg); err != nil {
|
||||
log.Warn("Error while unmarshaling raw config json")
|
||||
}
|
||||
|
||||
switch cfg.Retention.Policy {
|
||||
case "delete":
|
||||
log.Info("Register retention delete service")
|
||||
|
||||
s.Every(1).Day().At("4:00").Do(func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().CleanUp(jobs)
|
||||
|
||||
if cfg.Retention.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %s", err.Error())
|
||||
}
|
||||
}
|
||||
})
|
||||
case "move":
|
||||
log.Info("Register retention move service")
|
||||
|
||||
s.Every(1).Day().At("4:00").Do(func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().Move(jobs, cfg.Retention.Location)
|
||||
|
||||
if cfg.Retention.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %v", err)
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %v", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if cfg.Compression > 0 {
|
||||
log.Info("Register compression service")
|
||||
|
||||
s.Every(1).Day().At("5:00").Do(func() {
|
||||
var jobs []*schema.Job
|
||||
|
||||
ar := archive.GetHandle()
|
||||
startTime := time.Now().Unix() - int64(cfg.Compression*24*3600)
|
||||
lastTime := ar.CompressLast(startTime)
|
||||
if startTime == lastTime {
|
||||
log.Info("Compression Service - Complete archive run")
|
||||
jobs, err = jobRepo.FindJobsBetween(0, startTime)
|
||||
|
||||
} else {
|
||||
jobs, err = jobRepo.FindJobsBetween(lastTime, startTime)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for compression jobs: %v", err)
|
||||
}
|
||||
ar.Compress(jobs)
|
||||
})
|
||||
}
|
||||
|
||||
s.StartAsync()
|
||||
|
||||
if os.Getenv("GOGC") == "" {
|
||||
debug.SetGCPercent(25)
|
||||
}
|
||||
runtimeEnv.SystemdNotifiy(true, "running")
|
||||
wg.Wait()
|
||||
log.Print("Gracefull shutdown completed!")
|
||||
cclog.Print("Graceful shutdown completed!")
|
||||
}
|
||||
|
||||
378
cmd/cc-backend/server.go
Normal file
378
cmd/cc-backend/server.go
Normal file
@@ -0,0 +1,378 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/handler/transport"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/mux"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
)
|
||||
|
||||
var (
|
||||
router *mux.Router
|
||||
server *http.Server
|
||||
apiHandle *api.RestApi
|
||||
)
|
||||
|
||||
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
json.NewEncoder(rw).Encode(map[string]string{
|
||||
"status": http.StatusText(http.StatusUnauthorized),
|
||||
"error": err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
func serverInit() {
|
||||
// Setup the http.Handler/Router used by the server
|
||||
graph.Init()
|
||||
resolver := graph.GetResolverInstance()
|
||||
graphQLServer := handler.New(
|
||||
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
|
||||
// graphQLServer.AddTransport(transport.SSE{})
|
||||
graphQLServer.AddTransport(transport.POST{})
|
||||
// graphQLServer.AddTransport(transport.Websocket{
|
||||
// KeepAlivePingInterval: 10 * time.Second,
|
||||
// Upgrader: websocket.Upgrader{
|
||||
// CheckOrigin: func(r *http.Request) bool {
|
||||
// return true
|
||||
// },
|
||||
// },
|
||||
// })
|
||||
|
||||
if os.Getenv("DEBUG") != "1" {
|
||||
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
|
||||
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
|
||||
switch e := err.(type) {
|
||||
case string:
|
||||
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||
case error:
|
||||
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
|
||||
}
|
||||
|
||||
return errors.New("MAIN > Internal server error (panic)")
|
||||
})
|
||||
}
|
||||
|
||||
authHandle := auth.GetAuthInstance()
|
||||
|
||||
apiHandle = api.New()
|
||||
|
||||
router = mux.NewRouter()
|
||||
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
|
||||
|
||||
info := map[string]any{}
|
||||
info["hasOpenIDConnect"] = false
|
||||
|
||||
if auth.Keys.OpenIDConfig != nil {
|
||||
openIDConnect := auth.NewOIDC(authHandle)
|
||||
openIDConnect.RegisterEndpoints(router)
|
||||
info["hasOpenIDConnect"] = true
|
||||
}
|
||||
|
||||
router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
cclog.Debugf("##%v##", info)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||
}).Methods(http.MethodGet)
|
||||
router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||
})
|
||||
router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||
})
|
||||
|
||||
secured := router.PathPrefix("/").Subrouter()
|
||||
securedapi := router.PathPrefix("/api").Subrouter()
|
||||
userapi := router.PathPrefix("/userapi").Subrouter()
|
||||
configapi := router.PathPrefix("/config").Subrouter()
|
||||
frontendapi := router.PathPrefix("/frontend").Subrouter()
|
||||
metricstoreapi := router.PathPrefix("/metricstore").Subrouter()
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
router.Handle("/login", authHandle.Login(
|
||||
// On success: Handled within Login()
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})).Methods(http.MethodPost)
|
||||
|
||||
router.Handle("/jwt-login", authHandle.Login(
|
||||
// On success: Handled within Login()
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))
|
||||
|
||||
router.Handle("/logout", authHandle.Logout(
|
||||
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Bye - ClusterCockpit",
|
||||
MsgType: "alert-info",
|
||||
Message: "Logout successful",
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))).Methods(http.MethodPost)
|
||||
|
||||
secured.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.Auth(
|
||||
// On success;
|
||||
next,
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Authentication failed - ClusterCockpit",
|
||||
MsgType: "alert-danger",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
Redirect: r.RequestURI,
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
securedapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
userapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthUserAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
metricstoreapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthMetricStoreAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
configapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthConfigAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
frontendapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthFrontendAPI(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
}
|
||||
|
||||
if flagDev {
|
||||
router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||
router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
|
||||
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
|
||||
}
|
||||
secured.Handle("/query", graphQLServer)
|
||||
|
||||
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
|
||||
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||
})
|
||||
|
||||
// Mount all /monitoring/... and /api/... routes.
|
||||
routerConfig.SetupRoutes(secured, buildInfo)
|
||||
apiHandle.MountApiRoutes(securedapi)
|
||||
apiHandle.MountUserApiRoutes(userapi)
|
||||
apiHandle.MountConfigApiRoutes(configapi)
|
||||
apiHandle.MountFrontendApiRoutes(frontendapi)
|
||||
|
||||
if memorystore.InternalCCMSFlag {
|
||||
apiHandle.MountMetricStoreApiRoutes(metricstoreapi)
|
||||
}
|
||||
|
||||
if config.Keys.EmbedStaticFiles {
|
||||
if i, err := os.Stat("./var/img"); err == nil {
|
||||
if i.IsDir() {
|
||||
cclog.Info("Use local directory for static images")
|
||||
router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||
}
|
||||
}
|
||||
router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles()))
|
||||
} else {
|
||||
router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
|
||||
}
|
||||
|
||||
router.Use(handlers.CompressHandler)
|
||||
router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
|
||||
router.Use(handlers.CORS(
|
||||
handlers.AllowCredentials(),
|
||||
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
|
||||
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
|
||||
handlers.AllowedOrigins([]string{"*"})))
|
||||
|
||||
// secured.NotFoundHandler = http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
// page := web.Page{
|
||||
// Title: "ClusterCockpit - Not Found",
|
||||
// Build: buildInfo,
|
||||
// }
|
||||
// rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
// web.RenderTemplate(rw, "404.tmpl", &page)
|
||||
// })
|
||||
|
||||
// secured.NotFoundHandler = http.HandlerFunc(http.NotFound)
|
||||
// router.NotFoundHandler = router.NewRoute().HandlerFunc(http.NotFound).GetHandler()
|
||||
|
||||
// printEndpoints(router)
|
||||
}
|
||||
|
||||
// func printEndpoints(r *mux.Router) {
|
||||
// r.Walk(func(route *mux.Route, router *mux.Router, ancestors []*mux.Route) error {
|
||||
// path, err := route.GetPathTemplate()
|
||||
// if err != nil {
|
||||
// path = "nopath"
|
||||
// }
|
||||
// methods, err := route.GetMethods()
|
||||
// if err != nil {
|
||||
// methods = append(methods, "nomethod")
|
||||
// }
|
||||
// fmt.Printf("%v %s\n", methods, path)
|
||||
// return nil
|
||||
// })
|
||||
// }
|
||||
|
||||
func serverStart() {
|
||||
handler := handlers.CustomLoggingHandler(io.Discard, router, func(_ io.Writer, params handlers.LogFormatterParams) {
|
||||
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
|
||||
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
} else {
|
||||
cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
}
|
||||
})
|
||||
|
||||
server = &http.Server{
|
||||
ReadTimeout: 20 * time.Second,
|
||||
WriteTimeout: 20 * time.Second,
|
||||
Handler: handler,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||
if err != nil {
|
||||
cclog.Abortf("Server Start: Starting http listener on '%s' failed.\nError: %s\n", config.Keys.Addr, err.Error())
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
|
||||
}()
|
||||
}
|
||||
|
||||
if config.Keys.HTTPSCertFile != "" && config.Keys.HTTPSKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(
|
||||
config.Keys.HTTPSCertFile, config.Keys.HTTPSKeyFile)
|
||||
if err != nil {
|
||||
cclog.Abortf("Server Start: Loading X509 keypair failed. Check options 'https-cert-file' and 'https-key-file' in 'config.json'.\nError: %s\n", err.Error())
|
||||
}
|
||||
listener = tls.NewListener(listener, &tls.Config{
|
||||
Certificates: []tls.Certificate{cert},
|
||||
CipherSuites: []uint16{
|
||||
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
},
|
||||
MinVersion: tls.VersionTLS12,
|
||||
PreferServerCipherSuites: true,
|
||||
})
|
||||
cclog.Printf("HTTPS server listening at %s...\n", config.Keys.Addr)
|
||||
} else {
|
||||
cclog.Printf("HTTP server listening at %s...\n", config.Keys.Addr)
|
||||
}
|
||||
//
|
||||
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||
// be established first, then the user can be changed, and after that,
|
||||
// the actual http server can be started.
|
||||
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||
cclog.Abortf("Server Start: Error while preparing server start.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
cclog.Abortf("Server Start: Starting server failed.\nError: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func serverShutdown() {
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
server.Shutdown(context.Background())
|
||||
|
||||
// Archive all the metric store data
|
||||
if memorystore.InternalCCMSFlag {
|
||||
memorystore.Shutdown()
|
||||
}
|
||||
|
||||
// Then, wait for any async archivings still pending...
|
||||
archiver.WaitForArchiving()
|
||||
}
|
||||
@@ -1,56 +1,93 @@
|
||||
{
|
||||
"main": {
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
],
|
||||
"emission-constant": 317
|
||||
},
|
||||
"cron": {
|
||||
"commit-job-worker": "2m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"max-age": "2000h"
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store-internal",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store-internal",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metric-store": {
|
||||
"checkpoints": {
|
||||
"file-format": "avro",
|
||||
"interval": "1h",
|
||||
"directory": "./var/checkpoints",
|
||||
"restore": "48h"
|
||||
},
|
||||
"archive": {
|
||||
"interval": "1h",
|
||||
"directory": "./var/archive"
|
||||
},
|
||||
"retention-in-memory": "48h"
|
||||
},
|
||||
"ui-file": "./configs/uiConfig.json"
|
||||
}
|
||||
64
configs/config-mariadb.json
Normal file
64
configs/config-mariadb.json
Normal file
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"short-running-jobs-duration": 300,
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"db-driver": "mysql",
|
||||
"db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit",
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [600, 300, 120, 60]
|
||||
},
|
||||
"emission-constant": 317,
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,50 +1,50 @@
|
||||
{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:443",
|
||||
"ldap": {
|
||||
"url": "ldaps://test",
|
||||
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
|
||||
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
|
||||
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
|
||||
"user_filter": "(&(objectclass=posixAccount))"
|
||||
},
|
||||
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
|
||||
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
|
||||
"user": "clustercockpit",
|
||||
"group": "clustercockpit",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"validate": true,
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "eyJhbGciOiJF-E-pQBQ"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"validate": false,
|
||||
"apiAllowedIPs": ["*"],
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [600, 300, 120, 60]
|
||||
}
|
||||
},
|
||||
"cron": {
|
||||
"commit-job-worker": "2m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "eyJhbGciOiJF-E-pQBQ"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
],
|
||||
"jwts": {
|
||||
"cookieName": "",
|
||||
"validateUser": false,
|
||||
"max-age": "2000h",
|
||||
"trustedIssuer": ""
|
||||
},
|
||||
"short-running-jobs-duration": 300
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
|
||||
|
||||
my $node;
|
||||
my @sockets;
|
||||
my @nodeCores;
|
||||
foreach my $socket ( @{$DOMAINS{socket}} ) {
|
||||
push @sockets, "[".join(",", @{$socket})."]";
|
||||
$node .= join(",", @{$socket})
|
||||
push @nodeCores, join(",", @{$socket});
|
||||
}
|
||||
$node = join(",", @nodeCores);
|
||||
$INFO{sockets} = join(",\n", @sockets);
|
||||
|
||||
my @memDomains;
|
||||
@@ -212,9 +214,27 @@ print <<"END";
|
||||
"socketsPerNode": $INFO{socketsPerNode},
|
||||
"coresPerSocket": $INFO{coresPerSocket},
|
||||
"threadsPerCore": $INFO{threadsPerCore},
|
||||
"flopRateScalar": $flopsScalar,
|
||||
"flopRateSimd": $flopsSimd,
|
||||
"memoryBandwidth": $memBw,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsScalar
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsSimd
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $memBw
|
||||
},
|
||||
"nodes": "<FILL IN NODE RANGES>",
|
||||
"topology": {
|
||||
"node": [$node],
|
||||
|
||||
45
configs/uiConfig.json
Normal file
45
configs/uiConfig.json
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"jobList": {
|
||||
"usePaging": false,
|
||||
"showFootprint":false
|
||||
},
|
||||
"jobView": {
|
||||
"showPolarPlot": true,
|
||||
"showFootprint": true,
|
||||
"showRoofline": true,
|
||||
"showStatTable": true
|
||||
},
|
||||
"metricConfig": {
|
||||
"jobListMetrics": ["mem_bw", "flops_dp"],
|
||||
"jobViewPlotMetrics": ["mem_bw", "flops_dp"],
|
||||
"jobViewTableMetrics": ["mem_bw", "flops_dp"],
|
||||
"clusters": [
|
||||
{
|
||||
"name": "test",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "one",
|
||||
"jobListMetrics": ["mem_used", "flops_sp"]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"nodeList": {
|
||||
"usePaging": true
|
||||
},
|
||||
"plotConfiguration": {
|
||||
"plotsPerRow": 3,
|
||||
"colorBackground": true,
|
||||
"lineWidth": 3,
|
||||
"colorScheme": [
|
||||
"#00bfff",
|
||||
"#0000ff",
|
||||
"#ff00ff",
|
||||
"#ff0000",
|
||||
"#ff8000",
|
||||
"#ffff00",
|
||||
"#80ff00"
|
||||
]
|
||||
}
|
||||
}
|
||||
143
go.mod
143
go.mod
@@ -1,92 +1,103 @@
|
||||
module github.com/ClusterCockpit/cc-backend
|
||||
|
||||
go 1.18
|
||||
go 1.24.0
|
||||
|
||||
toolchain go1.24.1
|
||||
|
||||
require (
|
||||
github.com/99designs/gqlgen v0.17.45
|
||||
github.com/ClusterCockpit/cc-units v0.4.0
|
||||
github.com/Masterminds/squirrel v1.5.3
|
||||
github.com/coreos/go-oidc/v3 v3.9.0
|
||||
github.com/go-co-op/gocron v1.25.0
|
||||
github.com/go-ldap/ldap/v3 v3.4.4
|
||||
github.com/go-sql-driver/mysql v1.7.0
|
||||
github.com/golang-jwt/jwt/v5 v5.2.1
|
||||
github.com/golang-migrate/migrate/v4 v4.15.2
|
||||
github.com/google/gops v0.3.27
|
||||
github.com/gorilla/handlers v1.5.1
|
||||
github.com/gorilla/mux v1.8.0
|
||||
github.com/gorilla/sessions v1.2.1
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.12.2
|
||||
github.com/jmoiron/sqlx v1.3.5
|
||||
github.com/mattn/go-sqlite3 v1.14.16
|
||||
github.com/prometheus/client_golang v1.14.0
|
||||
github.com/prometheus/common v0.40.0
|
||||
github.com/99designs/gqlgen v0.17.81
|
||||
github.com/ClusterCockpit/cc-lib v0.10.1
|
||||
github.com/Masterminds/squirrel v1.5.4
|
||||
github.com/coreos/go-oidc/v3 v3.16.0
|
||||
github.com/expr-lang/expr v1.17.6
|
||||
github.com/go-co-op/gocron/v2 v2.16.0
|
||||
github.com/go-ldap/ldap/v3 v3.4.10
|
||||
github.com/go-sql-driver/mysql v1.9.0
|
||||
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||
github.com/golang-migrate/migrate/v4 v4.18.2
|
||||
github.com/google/gops v0.3.28
|
||||
github.com/gorilla/handlers v1.5.2
|
||||
github.com/gorilla/mux v1.8.1
|
||||
github.com/gorilla/sessions v1.4.0
|
||||
github.com/influxdata/line-protocol/v2 v2.2.1
|
||||
github.com/jmoiron/sqlx v1.4.0
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/linkedin/goavro/v2 v2.14.0
|
||||
github.com/mattn/go-sqlite3 v1.14.24
|
||||
github.com/nats-io/nats.go v1.47.0
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/prometheus/common v0.66.1
|
||||
github.com/qustavo/sqlhooks/v2 v2.1.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
|
||||
github.com/swaggo/http-swagger v1.3.3
|
||||
github.com/swaggo/swag v1.16.3
|
||||
github.com/vektah/gqlparser/v2 v2.5.11
|
||||
golang.org/x/crypto v0.21.0
|
||||
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
|
||||
golang.org/x/oauth2 v0.13.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
|
||||
github.com/swaggo/http-swagger v1.3.4
|
||||
github.com/swaggo/swag v1.16.6
|
||||
github.com/vektah/gqlparser/v2 v2.5.30
|
||||
golang.org/x/crypto v0.43.0
|
||||
golang.org/x/oauth2 v0.32.0
|
||||
golang.org/x/time v0.13.0
|
||||
)
|
||||
|
||||
require (
|
||||
filippo.io/edwards25519 v1.1.0 // indirect
|
||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||
github.com/agnivade/levenshtein v1.2.1 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
||||
github.com/containerd/containerd v1.6.26 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
|
||||
github.com/deepmap/oapi-codegen v1.12.4 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.3 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
|
||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.21.0 // indirect
|
||||
github.com/go-openapi/spec v0.21.0 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
github.com/golang/protobuf v1.5.3 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fsnotify/fsnotify v1.9.0 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect
|
||||
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.22.1 // indirect
|
||||
github.com/go-openapi/jsonreference v0.21.2 // indirect
|
||||
github.com/go-openapi/spec v0.22.0 // indirect
|
||||
github.com/go-openapi/swag/conv v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/jsonname v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/jsonutils v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/loading v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/stringutils v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/typeutils v0.25.1 // indirect
|
||||
github.com/go-openapi/swag/yamlutils v0.25.1 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
|
||||
github.com/golang/snappy v0.0.4 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.1 // indirect
|
||||
github.com/gorilla/websocket v1.5.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.2 // indirect
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
||||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/jonboulle/clockwork v0.5.0 // indirect
|
||||
github.com/jpillora/backoff v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.18.0 // indirect
|
||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
||||
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
|
||||
github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/prometheus/client_model v0.3.0 // indirect
|
||||
github.com/prometheus/procfs v0.9.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.11 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/sosodev/duration v1.2.0 // indirect
|
||||
github.com/swaggo/files v1.0.0 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.1 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
|
||||
go.uber.org/atomic v1.10.0 // indirect
|
||||
golang.org/x/mod v0.16.0 // indirect
|
||||
golang.org/x/net v0.22.0 // indirect
|
||||
golang.org/x/sys v0.18.0 // indirect
|
||||
golang.org/x/text v0.14.0 // indirect
|
||||
golang.org/x/tools v0.19.0 // indirect
|
||||
google.golang.org/appengine v1.6.8 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
|
||||
google.golang.org/protobuf v1.33.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
github.com/sosodev/duration v1.3.1 // indirect
|
||||
github.com/swaggo/files v1.0.1 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.7 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
|
||||
golang.org/x/mod v0.29.0 // indirect
|
||||
golang.org/x/net v0.46.0 // indirect
|
||||
golang.org/x/sync v0.17.0 // indirect
|
||||
golang.org/x/sys v0.37.0 // indirect
|
||||
golang.org/x/text v0.30.0 // indirect
|
||||
golang.org/x/tools v0.38.0 // indirect
|
||||
google.golang.org/protobuf v1.36.9 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
||||
sigs.k8s.io/yaml v1.6.0 // indirect
|
||||
)
|
||||
|
||||
63
gqlgen.yml
63
gqlgen.yml
@@ -30,7 +30,9 @@ resolver:
|
||||
# gqlgen will search for any type names in the schema in these go packages
|
||||
# if they match it will use them, otherwise it will generate them.
|
||||
autobind:
|
||||
- "github.com/99designs/gqlgen/graphql/introspection"
|
||||
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
- "github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
|
||||
# This section declares type mapping between the GraphQL and go type systems
|
||||
#
|
||||
@@ -50,34 +52,51 @@ models:
|
||||
- github.com/99designs/gqlgen/graphql.Int64
|
||||
- github.com/99designs/gqlgen/graphql.Int32
|
||||
Job:
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Job"
|
||||
model: "github.com/ClusterCockpit/cc-lib/schema.Job"
|
||||
fields:
|
||||
tags:
|
||||
resolver: true
|
||||
metaData:
|
||||
resolver: true
|
||||
Cluster:
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Cluster"
|
||||
model: "github.com/ClusterCockpit/cc-lib/schema.Cluster"
|
||||
fields:
|
||||
partitions:
|
||||
resolver: true
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
|
||||
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
|
||||
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
|
||||
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
|
||||
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
|
||||
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
|
||||
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
|
||||
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
|
||||
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
|
||||
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
|
||||
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
|
||||
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
|
||||
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
|
||||
# Node:
|
||||
# model: "github.com/ClusterCockpit/cc-lib/schema.Node"
|
||||
# fields:
|
||||
# metaData:
|
||||
# resolver: true
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" }
|
||||
MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" }
|
||||
JobStatistics:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" }
|
||||
GlobalMetricListItem:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" }
|
||||
ClusterSupport:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" }
|
||||
Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" }
|
||||
SchedulerState:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" }
|
||||
HealthState:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" }
|
||||
MetricStatistics:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" }
|
||||
MetricConfig:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" }
|
||||
SubClusterConfig:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" }
|
||||
Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" }
|
||||
Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" }
|
||||
FilterRanges:
|
||||
{ model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" }
|
||||
SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" }
|
||||
Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit Web Server (Go edition)
|
||||
Description=ClusterCockpit Web Server
|
||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api_test
|
||||
@@ -14,18 +14,22 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
@@ -33,15 +37,22 @@ import (
|
||||
|
||||
func setup(t *testing.T) *api.RestApi {
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
@@ -110,22 +121,22 @@ func setup(t *testing.T) *api.RestApi {
|
||||
]
|
||||
}`
|
||||
|
||||
log.Init("info", true)
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
||||
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 2), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJson), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -136,31 +147,46 @@ func setup(t *testing.T) *api.RestApi {
|
||||
}
|
||||
|
||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0666); err != nil {
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
config.Init(cfgFilePath)
|
||||
ccconf.Init(cfgFilePath)
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
repository.Connect("sqlite3", dbfilepath)
|
||||
db := repository.GetConnection()
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
||||
if err := metricdata.Init(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
jobRepo := repository.GetJobRepository()
|
||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
||||
archiver.Start(repository.GetJobRepository())
|
||||
|
||||
return &api.RestApi{
|
||||
JobRepository: resolver.Repo,
|
||||
Resolver: resolver,
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
graph.Init()
|
||||
|
||||
return api.New()
|
||||
}
|
||||
|
||||
func cleanup() {
|
||||
@@ -175,7 +201,6 @@ func cleanup() {
|
||||
func TestRestApi(t *testing.T) {
|
||||
restapi := setup(t)
|
||||
t.Cleanup(cleanup)
|
||||
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
@@ -192,15 +217,21 @@ func TestRestApi(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
restapi.MountRoutes(r)
|
||||
r.PathPrefix("/api").Subrouter()
|
||||
r.StrictSlash(true)
|
||||
restapi.MountApiRoutes(r)
|
||||
|
||||
var TestJobId int64 = 123
|
||||
TestClusterName := "testcluster"
|
||||
var TestStartTime int64 = 123456789
|
||||
|
||||
const startJobBody string = `{
|
||||
"jobId": 123,
|
||||
"jobId": 123,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
@@ -210,10 +241,9 @@ func TestRestApi(t *testing.T) {
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"exclusive": 1,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"tags": [{ "type": "testTagType", "name": "testTagName" }],
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
@@ -224,31 +254,37 @@ func TestRestApi(t *testing.T) {
|
||||
"startTime": 123456789
|
||||
}`
|
||||
|
||||
var dbid int64
|
||||
const contextUserKey repository.ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
if ok := t.Run("StartJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
var res api.StartJobApiResponse
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
|
||||
// resolver := graph.GetResolverInstance()
|
||||
restapi.JobRepository.SyncJobs()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// job.Tags, err = resolver.Job().Tags(ctx, job)
|
||||
// if err != nil {
|
||||
// t.Fatal(err)
|
||||
// }
|
||||
|
||||
if job.JobID != 123 ||
|
||||
job.User != "testuser" ||
|
||||
@@ -257,23 +293,20 @@ func TestRestApi(t *testing.T) {
|
||||
job.SubCluster != "sc1" ||
|
||||
job.Partition != "default" ||
|
||||
job.Walltime != 3600 ||
|
||||
job.ArrayJobId != 0 ||
|
||||
job.ArrayJobID != 0 ||
|
||||
job.NumNodes != 1 ||
|
||||
job.NumHWThreads != 8 ||
|
||||
job.NumAcc != 0 ||
|
||||
job.Exclusive != 1 ||
|
||||
job.MonitoringStatus != 1 ||
|
||||
job.SMT != 1 ||
|
||||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
|
||||
job.StartTime.Unix() != 123456789 {
|
||||
job.StartTime != 123456789 {
|
||||
t.Fatalf("unexpected job properties: %#v", job)
|
||||
}
|
||||
|
||||
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
|
||||
t.Fatalf("unexpected tags: %#v", job.Tags)
|
||||
}
|
||||
|
||||
dbid = res.DBID
|
||||
// if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
|
||||
// t.Fatalf("unexpected tags: %#v", job.Tags)
|
||||
// }
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
@@ -289,17 +322,19 @@ func TestRestApi(t *testing.T) {
|
||||
|
||||
var stoppedJob *schema.Job
|
||||
if ok := t.Run("StopJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
|
||||
archiver.WaitForArchiving()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -327,7 +362,7 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
|
||||
t.Run("CheckArchive", func(t *testing.T) {
|
||||
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
|
||||
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -339,12 +374,14 @@ func TestRestApi(t *testing.T) {
|
||||
|
||||
t.Run("CheckDoubleStart", func(t *testing.T) {
|
||||
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
|
||||
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
|
||||
body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusUnprocessableEntity {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -359,7 +396,7 @@ func TestRestApi(t *testing.T) {
|
||||
"partition": "default",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"exclusive": 1,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
@@ -371,10 +408,12 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok := t.Run("StartJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -384,8 +423,11 @@ func TestRestApi(t *testing.T) {
|
||||
t.Fatal("subtest failed")
|
||||
}
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
restapi.JobRepository.SyncJobs()
|
||||
|
||||
const stopJobBodyFailed string = `{
|
||||
"jobId": 12345,
|
||||
"jobId": 12345,
|
||||
"cluster": "testcluster",
|
||||
|
||||
"jobState": "failed",
|
||||
@@ -393,16 +435,18 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok = t.Run("StopJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
archiver.WaitForArchiving()
|
||||
jobid, cluster := int64(12345), "testcluster"
|
||||
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||
if err != nil {
|
||||
|
||||
71
internal/api/cluster.go
Normal file
71
internal/api/cluster.go
Normal file
@@ -0,0 +1,71 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// GetClustersAPIResponse model
|
||||
type GetClustersAPIResponse struct {
|
||||
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
|
||||
}
|
||||
|
||||
// getClusters godoc
|
||||
// @summary Lists all cluster configs
|
||||
// @tags Cluster query
|
||||
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
|
||||
// @produce json
|
||||
// @param cluster query string false "Job Cluster"
|
||||
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/clusters/ [get]
|
||||
func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
|
||||
var clusters []*schema.Cluster
|
||||
|
||||
if r.URL.Query().Has("cluster") {
|
||||
name := r.URL.Query().Get("cluster")
|
||||
cluster := archive.GetCluster(name)
|
||||
if cluster == nil {
|
||||
handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
clusters = append(clusters, cluster)
|
||||
} else {
|
||||
clusters = archive.Clusters
|
||||
}
|
||||
|
||||
payload := GetClustersAPIResponse{
|
||||
Clusters: clusters,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
984
internal/api/job.go
Normal file
984
internal/api/job.go
Normal file
@@ -0,0 +1,984 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// StopJobApiRequest model
|
||||
type StopJobApiRequest struct {
|
||||
JobId *int64 `json:"jobId" example:"123000"`
|
||||
Cluster *string `json:"cluster" example:"fritz"`
|
||||
StartTime *int64 `json:"startTime" example:"1649723812"`
|
||||
State schema.JobState `json:"jobState" validate:"required" example:"completed"`
|
||||
StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"`
|
||||
}
|
||||
|
||||
// DeleteJobApiRequest model
|
||||
type DeleteJobApiRequest struct {
|
||||
JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
|
||||
Cluster *string `json:"cluster" example:"fritz"` // Cluster of job
|
||||
StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch
|
||||
}
|
||||
|
||||
// GetJobsApiResponse model
|
||||
type GetJobsApiResponse struct {
|
||||
Jobs []*schema.Job `json:"jobs"` // Array of jobs
|
||||
Items int `json:"items"` // Number of jobs returned
|
||||
Page int `json:"page"` // Page id returned
|
||||
}
|
||||
|
||||
// ApiTag model
|
||||
type ApiTag struct {
|
||||
// Tag Type
|
||||
Type string `json:"type" example:"Debug"`
|
||||
Name string `json:"name" example:"Testjob"` // Tag Name
|
||||
Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display
|
||||
}
|
||||
|
||||
// ApiMeta model
|
||||
type EditMetaRequest struct {
|
||||
Key string `json:"key" example:"jobScript"`
|
||||
Value string `json:"value" example:"bash script"`
|
||||
}
|
||||
|
||||
type TagJobApiRequest []*ApiTag
|
||||
|
||||
type GetJobApiRequest []string
|
||||
|
||||
type GetJobApiResponse struct {
|
||||
Meta *schema.Job
|
||||
Data []*JobMetricWithName
|
||||
}
|
||||
|
||||
type GetCompleteJobApiResponse struct {
|
||||
Meta *schema.Job
|
||||
Data schema.JobData
|
||||
}
|
||||
|
||||
type JobMetricWithName struct {
|
||||
Metric *schema.JobMetric `json:"metric"`
|
||||
Name string `json:"name"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
}
|
||||
|
||||
// getJobs godoc
|
||||
// @summary Lists all jobs
|
||||
// @tags Job query
|
||||
// @description Get a list of all jobs. Filters can be applied using query parameters.
|
||||
// @description Number of results can be limited by page. Results are sorted by descending startTime.
|
||||
// @produce json
|
||||
// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout)
|
||||
// @param cluster query string false "Job Cluster"
|
||||
// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds"
|
||||
// @param items-per-page query int false "Items per page (Default: 25)"
|
||||
// @param page query int false "Page Number (Default: 1)"
|
||||
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
|
||||
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/ [get]
|
||||
func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
withMetadata := false
|
||||
filter := &model.JobFilter{}
|
||||
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
|
||||
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
|
||||
|
||||
for key, vals := range r.URL.Query() {
|
||||
switch key {
|
||||
// TODO: add project filter
|
||||
case "state":
|
||||
for _, s := range vals {
|
||||
state := schema.JobState(s)
|
||||
if !state.Valid() {
|
||||
handleError(fmt.Errorf("invalid query parameter value: state"),
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
filter.State = append(filter.State, state)
|
||||
}
|
||||
case "cluster":
|
||||
filter.Cluster = &model.StringInput{Eq: &vals[0]}
|
||||
case "start-time": // ?startTime=1753707480-1754053139
|
||||
st := strings.Split(vals[0], "-")
|
||||
if len(st) != 2 {
|
||||
handleError(fmt.Errorf("invalid query parameter value: startTime"),
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
from, err := strconv.ParseInt(st[0], 10, 64)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
to, err := strconv.ParseInt(st[1], 10, 64)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
ufrom, uto := time.Unix(from, 0), time.Unix(to, 0)
|
||||
filter.StartTime = &config.TimeRange{From: &ufrom, To: &uto}
|
||||
case "page":
|
||||
x, err := strconv.Atoi(vals[0])
|
||||
if err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
page.Page = x
|
||||
case "items-per-page":
|
||||
x, err := strconv.Atoi(vals[0])
|
||||
if err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
page.ItemsPerPage = x
|
||||
case "with-metadata":
|
||||
withMetadata = true
|
||||
default:
|
||||
handleError(fmt.Errorf("invalid query parameter: %s", key),
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
results := make([]*schema.Job, 0, len(jobs))
|
||||
for _, job := range jobs {
|
||||
if withMetadata {
|
||||
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful {
|
||||
job.Statistics, err = archive.GetStatistics(job)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
results = append(results, job)
|
||||
}
|
||||
|
||||
cclog.Debugf("/api/jobs: %d jobs returned", len(results))
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
|
||||
payload := GetJobsApiResponse{
|
||||
Jobs: results,
|
||||
Items: page.ItemsPerPage,
|
||||
Page: page.Page,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// getCompleteJobById godoc
|
||||
// @summary Get job meta and optional all metric data
|
||||
// @tags Job query
|
||||
// @description Job to get is specified by database ID
|
||||
// @description Returns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @param all-metrics query bool false "Include all available metrics"
|
||||
// @success 200 {object} api.GetJobApiResponse "Job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/{id} [get]
|
||||
func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
|
||||
} else {
|
||||
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
|
||||
}
|
||||
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
|
||||
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
var scopes []schema.MetricScope
|
||||
|
||||
if job.NumNodes == 1 {
|
||||
scopes = []schema.MetricScope{"core"}
|
||||
} else {
|
||||
scopes = []schema.MetricScope{"node"}
|
||||
}
|
||||
|
||||
var data schema.JobData
|
||||
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
resolution := 0
|
||||
|
||||
for _, mc := range metricConfigs {
|
||||
resolution = max(resolution, mc.Timestep)
|
||||
}
|
||||
|
||||
if r.URL.Query().Get("all-metrics") == "true" {
|
||||
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Debugf("/api/job/%s: get job %d", id, job.JobID)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
|
||||
payload := GetCompleteJobApiResponse{
|
||||
Meta: job,
|
||||
Data: data,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// getJobById godoc
|
||||
// @summary Get job meta and configurable metric data
|
||||
// @tags Job query
|
||||
// @description Job to get is specified by database ID
|
||||
// @description Returns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @param request body api.GetJobApiRequest true "Array of metric names"
|
||||
// @success 200 {object} api.GetJobApiResponse "Job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/{id} [post]
|
||||
func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(r.Context(), id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
|
||||
}
|
||||
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
|
||||
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
var metrics GetJobApiRequest
|
||||
if err = decode(r.Body, &metrics); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
var scopes []schema.MetricScope
|
||||
|
||||
if job.NumNodes == 1 {
|
||||
scopes = []schema.MetricScope{"core"}
|
||||
} else {
|
||||
scopes = []schema.MetricScope{"node"}
|
||||
}
|
||||
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
resolution := 0
|
||||
|
||||
for _, mc := range metricConfigs {
|
||||
resolution = max(resolution, mc.Timestep)
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
}
|
||||
|
||||
res := []*JobMetricWithName{}
|
||||
for name, md := range data {
|
||||
for scope, metric := range md {
|
||||
res = append(res, &JobMetricWithName{
|
||||
Name: name,
|
||||
Scope: scope,
|
||||
Metric: metric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Debugf("/api/job/%s: get job %d", id, job.JobID)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
|
||||
payload := GetJobApiResponse{
|
||||
Meta: job,
|
||||
Data: res,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(bw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// editMeta godoc
|
||||
// @summary Edit meta-data json
|
||||
// @tags Job add and modify
|
||||
// @description Edit key value pairs in job metadata json
|
||||
// @description If a key already exists its content will be overwritten
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.EditMetaRequest true "Kay value pair to add"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/edit_meta/{id} [post]
|
||||
func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
var req EditMetaRequest
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
}
|
||||
|
||||
// tagJob godoc
|
||||
// @summary Adds one or more tags to a job
|
||||
// @tags Job add and modify
|
||||
// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
|
||||
// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
|
||||
// @description If tagged job is already finished: Tag will be written directly to respective archive files.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/tag_job/{id} [post]
|
||||
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
var req TagJobApiRequest
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
for _, tag := range req {
|
||||
tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags = append(job.Tags, &schema.Tag{
|
||||
ID: tagId,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
Scope: tag.Scope,
|
||||
})
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
}
|
||||
|
||||
// removeTagJob godoc
|
||||
// @summary Removes one or more tags from a job
|
||||
// @tags Job add and modify
|
||||
// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.
|
||||
// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
|
||||
// @description If tagged job is already finished: Tag will be removed from respective archive files.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/tag_job/{id} [delete]
|
||||
func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
var req TagJobApiRequest
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
for _, rtag := range req {
|
||||
// Only Global and Admin Tags
|
||||
if rtag.Scope != "global" && rtag.Scope != "admin" {
|
||||
cclog.Warnf("Cannot delete private tag for job %d: Skip", job.JobID)
|
||||
continue
|
||||
}
|
||||
|
||||
remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags = remainingTags
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
}
|
||||
|
||||
// removeTags godoc
|
||||
// @summary Removes all tags and job-relations for type:name tuple
|
||||
// @tags Tag remove
|
||||
// @description Removes tags by type and name. Name and Type of Tag(s) must match.
|
||||
// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
|
||||
// @description Tag wills be removed from respective archive files.
|
||||
// @accept json
|
||||
// @produce plain
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
|
||||
// @success 200 {string} string "Success Response"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /tags/ [delete]
|
||||
func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) {
|
||||
var req TagJobApiRequest
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
targetCount := len(req)
|
||||
currentCount := 0
|
||||
for _, rtag := range req {
|
||||
// Only Global and Admin Tags
|
||||
if rtag.Scope != "global" && rtag.Scope != "admin" {
|
||||
cclog.Warn("Cannot delete private tag: Skip")
|
||||
continue
|
||||
}
|
||||
|
||||
err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
} else {
|
||||
currentCount++
|
||||
}
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
fmt.Fprintf(rw, "Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount)
|
||||
}
|
||||
|
||||
// startJob godoc
|
||||
// @summary Adds a new job as "running"
|
||||
// @tags Job add and modify
|
||||
// @description Job specified in request body will be saved to database as "running" with new DB ID.
|
||||
// @description Job specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body schema.Job true "Job to add"
|
||||
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/start_job/ [post]
|
||||
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
req := schema.Job{
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
cclog.Printf("REST: %s\n", req.GoString())
|
||||
req.State = schema.JobStateRunning
|
||||
|
||||
if err := importer.SanityChecks(&req); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// aquire lock to avoid race condition between API calls
|
||||
var unlockOnce sync.Once
|
||||
api.RepositoryMutex.Lock()
|
||||
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
||||
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
} else if err == nil {
|
||||
for _, job := range jobs {
|
||||
if (req.StartTime - job.StartTime) < 86400 {
|
||||
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
id, err := api.JobRepository.Start(&req)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
// unlock here, adding Tags can be async
|
||||
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
for _, tag := range req.Tags {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
json.NewEncoder(rw).Encode(DefaultApiResponse{
|
||||
Message: "success",
|
||||
})
|
||||
}
|
||||
|
||||
// stopJobByRequest godoc
|
||||
// @summary Marks job as completed and triggers archiving
|
||||
// @tags Job add and modify
|
||||
// @description Job to stop is specified by request body. All fields are required in this case.
|
||||
// @description Returns full job resource information according to 'Job' scheme.
|
||||
// @produce json
|
||||
// @param request body api.StopJobApiRequest true "All fields required"
|
||||
// @success 200 {object} schema.Job "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/stop_job/ [post]
|
||||
func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// Parse request body
|
||||
req := StopJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will be stopped) from db
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if req.JobId == nil {
|
||||
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime)
|
||||
// FIXME: Previous error is hidden
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
api.checkAndHandleStopJob(rw, job, req)
|
||||
}
|
||||
|
||||
// deleteJobById godoc
|
||||
// @summary Remove a job from the sql database
|
||||
// @tags Job remove
|
||||
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/delete_job/{id} [delete]
|
||||
func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var err error
|
||||
if ok {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
err = api.JobRepository.DeleteJobById(id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DefaultApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted job %s", id),
|
||||
})
|
||||
}
|
||||
|
||||
// deleteJobByRequest godoc
|
||||
// @summary Remove a job from the sql database
|
||||
// @tags Job remove
|
||||
// @description Job to delete is specified by request body. All fields are required in this case.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body api.DeleteJobApiRequest true "All fields required"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/delete_job/ [delete]
|
||||
func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// Parse request body
|
||||
req := DeleteJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will be deleted) from db
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if req.JobId == nil {
|
||||
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
err = api.JobRepository.DeleteJobById(*job.ID)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DefaultApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
|
||||
})
|
||||
}
|
||||
|
||||
// deleteJobBefore godoc
|
||||
// @summary Remove a job from the sql database
|
||||
// @tags Job remove
|
||||
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
|
||||
// @produce json
|
||||
// @param ts path int true "Unix epoch timestamp"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/delete_job_before/{ts} [delete]
|
||||
func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
var cnt int
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["ts"]
|
||||
var err error
|
||||
if ok {
|
||||
ts, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
cnt, err = api.JobRepository.DeleteJobsBefore(ts)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DefaultApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
|
||||
})
|
||||
}
|
||||
|
||||
func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) {
|
||||
// Sanity checks
|
||||
if job.State != schema.JobStateRunning {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if job == nil || job.StartTime > req.StopTime {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
|
||||
return
|
||||
} else if req.State == "" {
|
||||
req.State = schema.JobStateCompleted
|
||||
}
|
||||
|
||||
// Mark job as stopped in the database (update state and duration)
|
||||
job.Duration = int32(req.StopTime - job.StartTime)
|
||||
job.State = req.State
|
||||
api.JobRepository.Mutex.Lock()
|
||||
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
api.JobRepository.Mutex.Unlock()
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
api.JobRepository.Mutex.Unlock()
|
||||
|
||||
cclog.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
|
||||
// Send a response (with status OK). This means that errors that happen from here on forward
|
||||
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
|
||||
// writing to the filesystem fails, the client will not know.
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
|
||||
// Monitoring is disabled...
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
||||
return
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
archiver.TriggerArchiving(job)
|
||||
}
|
||||
|
||||
func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
id := mux.Vars(r)["id"]
|
||||
metrics := r.URL.Query()["metric"]
|
||||
var scopes []schema.MetricScope
|
||||
for _, scope := range r.URL.Query()["scope"] {
|
||||
var s schema.MetricScope
|
||||
if err := s.UnmarshalGQL(scope); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
scopes = append(scopes, s)
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
|
||||
type Respone struct {
|
||||
Data *struct {
|
||||
JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
|
||||
} `json:"data"`
|
||||
Error *struct {
|
||||
Message string `json:"message"`
|
||||
} `json:"error"`
|
||||
}
|
||||
|
||||
resolver := graph.GetResolverInstance()
|
||||
data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil)
|
||||
if err != nil {
|
||||
json.NewEncoder(rw).Encode(Respone{
|
||||
Error: &struct {
|
||||
Message string "json:\"message\""
|
||||
}{Message: err.Error()},
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
json.NewEncoder(rw).Encode(Respone{
|
||||
Data: &struct {
|
||||
JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\""
|
||||
}{JobMetrics: data},
|
||||
})
|
||||
}
|
||||
177
internal/api/memorystore.go
Normal file
177
internal/api/memorystore.go
Normal file
@@ -0,0 +1,177 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
// handleFree godoc
|
||||
// @summary
|
||||
// @tags free
|
||||
// @description This endpoint allows the users to free the Buffers from the
|
||||
// metric store. This endpoint offers the users to remove then systematically
|
||||
// and also allows then to prune the data under node, if they do not want to
|
||||
// remove the whole node.
|
||||
// @produce json
|
||||
// @param to query string false "up to timestamp"
|
||||
// @success 200 {string} string "ok"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /free/ [post]
|
||||
func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
rawTo := r.URL.Query().Get("to")
|
||||
if rawTo == "" {
|
||||
handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
to, err := strconv.ParseInt(rawTo, 10, 64)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// // TODO: lastCheckpoint might be modified by different go-routines.
|
||||
// // Load it using the sync/atomic package?
|
||||
// freeUpTo := lastCheckpoint.Unix()
|
||||
// if to < freeUpTo {
|
||||
// freeUpTo = to
|
||||
// }
|
||||
|
||||
bodyDec := json.NewDecoder(r.Body)
|
||||
var selectors [][]string
|
||||
err = bodyDec.Decode(&selectors)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
n := 0
|
||||
for _, sel := range selectors {
|
||||
bn, err := ms.Free(sel, to)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
n += bn
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
fmt.Fprintf(rw, "buffers freed: %d\n", n)
|
||||
}
|
||||
|
||||
// handleWrite godoc
|
||||
// @summary Receive metrics in InfluxDB line-protocol
|
||||
// @tags write
|
||||
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
|
||||
|
||||
// @accept plain
|
||||
// @produce json
|
||||
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
|
||||
// @success 200 {string} string "ok"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /write/ [post]
|
||||
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
bytes, err := io.ReadAll(r.Body)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
dec := lineprotocol.NewDecoderWithBytes(bytes)
|
||||
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
|
||||
cclog.Errorf("/api/write error: %s", err.Error())
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
// handleDebug godoc
|
||||
// @summary Debug endpoint
|
||||
// @tags debug
|
||||
// @description This endpoint allows the users to print the content of
|
||||
// nodes/clusters/metrics to review the state of the data.
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /debug/ [post]
|
||||
func debugMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
raw := r.URL.Query().Get("selector")
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
selector := []string{}
|
||||
if len(raw) != 0 {
|
||||
selector = strings.Split(raw, ":")
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealthCheck godoc
|
||||
// @summary HealthCheck endpoint
|
||||
// @tags healthcheck
|
||||
// @description This endpoint allows the users to check if a node is healthy
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /healthcheck/ [get]
|
||||
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
rawCluster := r.URL.Query().Get("cluster")
|
||||
rawNode := r.URL.Query().Get("node")
|
||||
|
||||
if rawCluster == "" || rawNode == "" {
|
||||
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
|
||||
selector := []string{rawCluster, rawNode}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
80
internal/api/node.go
Normal file
80
internal/api/node.go
Normal file
@@ -0,0 +1,80 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type UpdateNodeStatesRequest struct {
|
||||
Nodes []schema.NodePayload `json:"nodes"`
|
||||
Cluster string `json:"cluster" example:"fritz"`
|
||||
}
|
||||
|
||||
// this routine assumes that only one of them exists per node
|
||||
func determineState(states []string) schema.SchedulerState {
|
||||
for _, state := range states {
|
||||
switch strings.ToLower(state) {
|
||||
case "allocated":
|
||||
return schema.NodeStateAllocated
|
||||
case "reserved":
|
||||
return schema.NodeStateReserved
|
||||
case "idle":
|
||||
return schema.NodeStateIdle
|
||||
case "down":
|
||||
return schema.NodeStateDown
|
||||
case "mixed":
|
||||
return schema.NodeStateMixed
|
||||
}
|
||||
}
|
||||
|
||||
return schema.NodeStateUnknown
|
||||
}
|
||||
|
||||
// updateNodeStates godoc
|
||||
// @summary Deliver updated Slurm node states
|
||||
// @tags Nodestates
|
||||
// @description Returns a JSON-encoded list of users.
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/nodestats/ [post]
|
||||
func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
// Parse request body
|
||||
req := UpdateNodeStatesRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err),
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(), NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
||||
}
|
||||
}
|
||||
1378
internal/api/rest.go
1378
internal/api/rest.go
File diff suppressed because it is too large
Load Diff
159
internal/api/user.go
Normal file
159
internal/api/user.go
Normal file
@@ -0,0 +1,159 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
type ApiReturnedUser struct {
|
||||
Username string `json:"username"`
|
||||
Name string `json:"name"`
|
||||
Roles []string `json:"roles"`
|
||||
Email string `json:"email"`
|
||||
Projects []string `json:"projects"`
|
||||
}
|
||||
|
||||
// getUsers godoc
|
||||
// @summary Returns a list of users
|
||||
// @tags User
|
||||
// @description Returns a JSON-encoded list of users.
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
|
||||
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
|
||||
// @failure 400 {string} string "Bad Request"
|
||||
// @failure 401 {string} string "Unauthorized"
|
||||
// @failure 403 {string} string "Forbidden"
|
||||
// @failure 500 {string} string "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/users/ [get]
|
||||
func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
json.NewEncoder(rw).Encode(users)
|
||||
}
|
||||
|
||||
func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
// Get Values
|
||||
newrole := r.FormValue("add-role")
|
||||
delrole := r.FormValue("remove-role")
|
||||
newproj := r.FormValue("add-project")
|
||||
delproj := r.FormValue("remove-project")
|
||||
|
||||
// TODO: Handle anything but roles...
|
||||
if newrole != "" {
|
||||
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
rw.Write([]byte("Add Role Success"))
|
||||
} else if delrole != "" {
|
||||
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
rw.Write([]byte("Remove Role Success"))
|
||||
} else if newproj != "" {
|
||||
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
rw.Write([]byte("Add Project Success"))
|
||||
} else if delproj != "" {
|
||||
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
rw.Write([]byte("Remove Project Success"))
|
||||
} else {
|
||||
http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
me := repository.GetUserFromContext(r.Context())
|
||||
if !me.HasRole(schema.RoleAdmin) {
|
||||
http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
username, password, role, name, email, project := r.FormValue("username"),
|
||||
r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
|
||||
r.FormValue("email"), r.FormValue("project")
|
||||
|
||||
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
|
||||
http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
|
||||
http.Error(rw, "only managers require a project (can be changed later)",
|
||||
http.StatusBadRequest)
|
||||
return
|
||||
} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
|
||||
http.Error(rw, "managers require a project to manage (can be changed later)",
|
||||
http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := repository.GetUserRepository().AddUser(&schema.User{
|
||||
Username: username,
|
||||
Name: name,
|
||||
Password: password,
|
||||
Email: email,
|
||||
Projects: []string{project},
|
||||
Roles: []string{role},
|
||||
}); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Fprintf(rw, "User %v successfully created!\n", username)
|
||||
}
|
||||
|
||||
func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
username := r.FormValue("username")
|
||||
if err := repository.GetUserRepository().DelUser(username); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
98
internal/archiver/archiveWorker.go
Normal file
98
internal/archiver/archiveWorker.go
Normal file
@@ -0,0 +1,98 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
var (
|
||||
archivePending sync.WaitGroup
|
||||
archiveChannel chan *schema.Job
|
||||
jobRepo *repository.JobRepository
|
||||
)
|
||||
|
||||
func Start(r *repository.JobRepository) {
|
||||
archiveChannel = make(chan *schema.Job, 128)
|
||||
jobRepo = r
|
||||
|
||||
go archivingWorker()
|
||||
}
|
||||
|
||||
// Archiving worker thread
|
||||
func archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
||||
|
||||
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
// Update the jobs database entry one last time:
|
||||
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
||||
if err := jobRepo.Execute(stmt); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
cclog.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
|
||||
repository.CallJobStopHooks(job)
|
||||
archivePending.Done()
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func TriggerArchiving(job *schema.Job) {
|
||||
if archiveChannel == nil {
|
||||
cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
|
||||
}
|
||||
|
||||
archivePending.Add(1)
|
||||
archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
archivePending.Wait()
|
||||
}
|
||||
79
internal/archiver/archiver.go
Normal file
79
internal/archiver/archiver.go
Normal file
@@ -0,0 +1,79 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
// FIXME: Add a config option for this
|
||||
if job.NumNodes <= 8 {
|
||||
// This will add the native scope if core scope is not available
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
if job.NumAcc > 0 {
|
||||
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||
}
|
||||
|
||||
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||
if err != nil {
|
||||
cclog.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job.Statistics = make(map[string]schema.JobStatistics)
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// This should never happen ?
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
// Round AVG Result to 2 Digits
|
||||
job.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if config.Keys.DisableArchive {
|
||||
return job, nil
|
||||
}
|
||||
|
||||
return job, archive.GetHandle().ImportJob(job, &jobData)
|
||||
}
|
||||
@@ -1,23 +1,34 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package auth implements various authentication methods
|
||||
package auth
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"database/sql"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
"github.com/gorilla/sessions"
|
||||
)
|
||||
|
||||
@@ -26,6 +37,32 @@ type Authenticator interface {
|
||||
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
|
||||
}
|
||||
|
||||
var (
|
||||
initOnce sync.Once
|
||||
authInstance *Authentication
|
||||
)
|
||||
|
||||
var ipUserLimiters sync.Map
|
||||
|
||||
func getIPUserLimiter(ip, username string) *rate.Limiter {
|
||||
key := ip + ":" + username
|
||||
limiter, ok := ipUserLimiters.Load(key)
|
||||
if !ok {
|
||||
newLimiter := rate.NewLimiter(rate.Every(time.Hour/10), 10)
|
||||
ipUserLimiters.Store(key, newLimiter)
|
||||
return newLimiter
|
||||
}
|
||||
return limiter.(*rate.Limiter)
|
||||
}
|
||||
|
||||
type AuthConfig struct {
|
||||
LdapConfig *LdapConfig `json:"ldap"`
|
||||
JwtConfig *JWTAuthConfig `json:"jwts"`
|
||||
OpenIDConfig *OpenIDConfig `json:"oidc"`
|
||||
}
|
||||
|
||||
var Keys AuthConfig
|
||||
|
||||
type Authentication struct {
|
||||
sessionStore *sessions.CookieStore
|
||||
LdapAuth *LdapAuthenticator
|
||||
@@ -41,7 +78,7 @@ func (auth *Authentication) AuthViaSession(
|
||||
) (*schema.User, error) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
if err != nil {
|
||||
log.Error("Error while getting session store")
|
||||
cclog.Error("Error while getting session store")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -62,82 +99,122 @@ func (auth *Authentication) AuthViaSession(
|
||||
}, nil
|
||||
}
|
||||
|
||||
func Init() (*Authentication, error) {
|
||||
auth := &Authentication{}
|
||||
func Init(authCfg *json.RawMessage) {
|
||||
initOnce.Do(func() {
|
||||
authInstance = &Authentication{}
|
||||
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
log.Error("Error while initializing authentication -> decoding session key failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
if config.Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
log.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
cclog.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
auth.LdapAuth = ldapAuth
|
||||
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig != nil {
|
||||
auth.JwtAuth = &JWTAuthenticator{}
|
||||
if err := auth.JwtAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> jwtAuth init failed")
|
||||
return nil, err
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> decoding session key failed")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err == nil {
|
||||
authInstance.SessionMaxAge = d
|
||||
}
|
||||
|
||||
if authCfg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
config.Validate(configSchema, *authCfg)
|
||||
dec := json.NewDecoder(bytes.NewReader(*authCfg))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Errorf("error while decoding ldap config: %v", err)
|
||||
}
|
||||
|
||||
if Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
cclog.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
} else {
|
||||
authInstance.LdapAuth = ldapAuth
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
|
||||
cclog.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
if Keys.JwtConfig != nil {
|
||||
authInstance.JwtAuth = &JWTAuthenticator{}
|
||||
if err := authInstance.JwtAuth.Init(); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> jwtAuth init failed")
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
cclog.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
cclog.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
|
||||
cclog.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
|
||||
auth.LocalAuth = &LocalAuthenticator{}
|
||||
if err := auth.LocalAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> localAuth init failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
|
||||
|
||||
return auth, nil
|
||||
authInstance.LocalAuth = &LocalAuthenticator{}
|
||||
if err := authInstance.LocalAuth.Init(); err != nil {
|
||||
cclog.Fatal("Error while initializing authentication -> localAuth init failed")
|
||||
}
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
|
||||
})
|
||||
}
|
||||
|
||||
func persistUser(user *schema.User) {
|
||||
func GetAuthInstance() *Authentication {
|
||||
if authInstance == nil {
|
||||
cclog.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return authInstance
|
||||
}
|
||||
|
||||
func handleTokenUser(tokenUser *schema.User) {
|
||||
r := repository.GetUserRepository()
|
||||
_, err := r.GetUser(user.Username)
|
||||
dbUser, err := r.GetUser(tokenUser.Username)
|
||||
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||
} else if err == sql.ErrNoRows {
|
||||
if err := r.AddUser(user); err != nil {
|
||||
log.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||
cclog.Errorf("Error while loading user '%s': %v", tokenUser.Username, err)
|
||||
} else if err == sql.ErrNoRows && Keys.JwtConfig.SyncUserOnLogin { // Adds New User
|
||||
if err := r.AddUser(tokenUser); err != nil {
|
||||
cclog.Errorf("Error while adding user '%s' to DB: %v", tokenUser.Username, err)
|
||||
}
|
||||
} else if err == nil && Keys.JwtConfig.UpdateUserOnLogin { // Update Existing User
|
||||
if err := r.UpdateUser(dbUser, tokenUser); err != nil {
|
||||
cclog.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func handleOIDCUser(OIDCUser *schema.User) {
|
||||
r := repository.GetUserRepository()
|
||||
dbUser, err := r.GetUser(OIDCUser.Username)
|
||||
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
cclog.Errorf("Error while loading user '%s': %v", OIDCUser.Username, err)
|
||||
} else if err == sql.ErrNoRows && Keys.OpenIDConfig.SyncUserOnLogin { // Adds New User
|
||||
if err := r.AddUser(OIDCUser); err != nil {
|
||||
cclog.Errorf("Error while adding user '%s' to DB: %v", OIDCUser.Username, err)
|
||||
}
|
||||
} else if err == nil && Keys.OpenIDConfig.UpdateUserOnLogin { // Update Existing User
|
||||
if err := r.UpdateUser(dbUser, OIDCUser); err != nil {
|
||||
cclog.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -145,7 +222,7 @@ func persistUser(user *schema.User) {
|
||||
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
|
||||
session, err := auth.sessionStore.New(r, "session")
|
||||
if err != nil {
|
||||
log.Errorf("session creation failed: %s", err.Error())
|
||||
cclog.Errorf("session creation failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return err
|
||||
}
|
||||
@@ -153,11 +230,15 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
if auth.SessionMaxAge != 0 {
|
||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||
}
|
||||
if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
|
||||
session.Options.Secure = false
|
||||
}
|
||||
session.Options.SameSite = http.SameSiteStrictMode
|
||||
session.Values["username"] = user.Username
|
||||
session.Values["projects"] = user.Projects
|
||||
session.Values["roles"] = user.Roles
|
||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||
log.Warnf("session save failed: %s", err.Error())
|
||||
cclog.Warnf("session save failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return err
|
||||
}
|
||||
@@ -166,18 +247,29 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
}
|
||||
|
||||
func (auth *Authentication) Login(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
username := r.FormValue("username")
|
||||
var dbUser *schema.User
|
||||
ip, _, err := net.SplitHostPort(r.RemoteAddr)
|
||||
if err != nil {
|
||||
ip = r.RemoteAddr
|
||||
}
|
||||
|
||||
username := r.FormValue("username")
|
||||
|
||||
limiter := getIPUserLimiter(ip, username)
|
||||
if !limiter.Allow() {
|
||||
cclog.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
|
||||
onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
|
||||
return
|
||||
}
|
||||
|
||||
var dbUser *schema.User
|
||||
if username != "" {
|
||||
var err error
|
||||
dbUser, err = repository.GetUserRepository().GetUser(username)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", username)
|
||||
cclog.Errorf("Error while loading user '%v'", username)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,12 +279,12 @@ func (auth *Authentication) Login(
|
||||
if user, ok = authenticator.CanLogin(dbUser, username, rw, r); !ok {
|
||||
continue
|
||||
} else {
|
||||
log.Debugf("Can login with user %v", user)
|
||||
cclog.Debugf("Can login with user %v", user)
|
||||
}
|
||||
|
||||
user, err := authenticator.Login(user, rw, r)
|
||||
if err != nil {
|
||||
log.Warnf("user login failed: %s", err.Error())
|
||||
cclog.Warnf("user login failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
@@ -201,13 +293,19 @@ func (auth *Authentication) Login(
|
||||
return
|
||||
}
|
||||
|
||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
|
||||
if r.FormValue("redirect") != "" {
|
||||
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
log.Debugf("login failed: no authenticator applied")
|
||||
cclog.Debugf("login failed: no authenticator applied")
|
||||
onfailure(rw, r, errors.New("no authenticator applied"))
|
||||
})
|
||||
}
|
||||
@@ -219,31 +317,186 @@ func (auth *Authentication) Auth(
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
if user == nil {
|
||||
user, err = auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
cclog.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug("authentication failed")
|
||||
cclog.Info("auth -> authentication failed")
|
||||
onfailure(rw, r, errors.New("unauthorized (please login first)"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
ipErr := securedCheck(user, r)
|
||||
if ipErr != nil {
|
||||
cclog.Infof("auth api -> secured check failed: %s", ipErr.Error())
|
||||
onfailure(rw, r, ipErr)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthUserAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth user api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth user api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth user api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthMetricStoreAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
cclog.Info("auth metricstore api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
cclog.Info("auth metricstore api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthConfigAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth config api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil && user.HasRole(schema.RoleAdmin) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
cclog.Info("auth config api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthFrontendAPI(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
cclog.Infof("auth frontend api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
cclog.Info("auth frontend api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
@@ -263,3 +516,39 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
onsuccess.ServeHTTP(rw, r)
|
||||
})
|
||||
}
|
||||
|
||||
// Helper Moved To MiddleWare Auth Handlers
|
||||
func securedCheck(user *schema.User, r *http.Request) error {
|
||||
if user == nil {
|
||||
return fmt.Errorf("no user for secured check")
|
||||
}
|
||||
|
||||
// extract IP address for checking
|
||||
IPAddress := r.Header.Get("X-Real-Ip")
|
||||
if IPAddress == "" {
|
||||
IPAddress = r.Header.Get("X-Forwarded-For")
|
||||
}
|
||||
if IPAddress == "" {
|
||||
IPAddress = r.RemoteAddr
|
||||
}
|
||||
|
||||
// FIXME: IPV6 not handled
|
||||
if strings.Contains(IPAddress, ":") {
|
||||
IPAddress = strings.Split(IPAddress, ":")[0]
|
||||
}
|
||||
|
||||
// If nothing declared in config: deny all request to this api endpoint
|
||||
if len(config.Keys.APIAllowedIPs) == 0 {
|
||||
return fmt.Errorf("missing configuration key ApiAllowedIPs")
|
||||
}
|
||||
// If wildcard declared in config: Continue
|
||||
if config.Keys.APIAllowedIPs[0] == "*" {
|
||||
return nil
|
||||
}
|
||||
// check if IP is allowed
|
||||
if !util.Contains(config.Keys.APIAllowedIPs, IPAddress) {
|
||||
return fmt.Errorf("unknown ip: %v", IPAddress)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,13 +14,34 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
type JWTAuthConfig struct {
|
||||
// Specifies for how long a JWT token shall be valid
|
||||
// as a string parsable by time.ParseDuration().
|
||||
MaxAge string `json:"max-age"`
|
||||
|
||||
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
|
||||
CookieName string `json:"cookieName"`
|
||||
|
||||
// Deny login for users not in database (but defined in JWT).
|
||||
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
|
||||
ValidateUser bool `json:"validateUser"`
|
||||
|
||||
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
|
||||
TrustedIssuer string `json:"trustedIssuer"`
|
||||
|
||||
// Should an non-existent user be added to the DB based on the information in the token
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
|
||||
// Should an existent user be updated in the DB based on the information in the token
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type JWTAuthenticator struct {
|
||||
publicKey ed25519.PublicKey
|
||||
privateKey ed25519.PrivateKey
|
||||
@@ -28,17 +50,17 @@ type JWTAuthenticator struct {
|
||||
func (ja *JWTAuthenticator) Init() error {
|
||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||
if pubKey == "" || privKey == "" {
|
||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT public key")
|
||||
cclog.Warn("Could not decode JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKey = ed25519.PublicKey(bytes)
|
||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT private key")
|
||||
cclog.Warn("Could not decode JWT private key")
|
||||
return err
|
||||
}
|
||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||
@@ -62,7 +84,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method != jwt.SigningMethodEdDSA {
|
||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||
}
|
||||
@@ -70,11 +92,11 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
return ja.publicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing JWT token")
|
||||
cclog.Warn("Error while parsing JWT token")
|
||||
return nil, err
|
||||
}
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
@@ -85,19 +107,19 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
var roles []string
|
||||
|
||||
// Validate user + roles from JWT against database?
|
||||
if config.Keys.JwtConfig.ValidateUser {
|
||||
if Keys.JwtConfig.ValidateUser {
|
||||
ur := repository.GetUserRepository()
|
||||
user, err := ur.GetUser(sub)
|
||||
// Deny any logins for unknown usernames
|
||||
if err != nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
cclog.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
// Take user roles from database instead of trusting the JWT
|
||||
roles = user.Roles
|
||||
} else {
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
roles = append(roles, r)
|
||||
@@ -114,7 +136,7 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Generate a new JWT that can be used for authentication
|
||||
// ProvideJWT generates a new JWT that can be used for authentication
|
||||
func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
||||
if ja.privateKey == nil {
|
||||
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
|
||||
@@ -126,8 +148,8 @@ func (ja *JWTAuthenticator) ProvideJWT(user *schema.User) (string, error) {
|
||||
"roles": user.Roles,
|
||||
"iat": now.Unix(),
|
||||
}
|
||||
if config.Keys.JwtConfig.MaxAge != "" {
|
||||
d, err := time.ParseDuration(config.Keys.JwtConfig.MaxAge)
|
||||
if Keys.JwtConfig.MaxAge != "" {
|
||||
d, err := time.ParseDuration(Keys.JwtConfig.MaxAge)
|
||||
if err != nil {
|
||||
return "", errors.New("cannot parse max-age config key")
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,10 +14,9 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -31,18 +31,18 @@ var _ Authenticator = (*JWTCookieSessionAuthenticator)(nil)
|
||||
func (ja *JWTCookieSessionAuthenticator) Init() error {
|
||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||
if pubKey == "" || privKey == "" {
|
||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
cclog.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
return errors.New("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT public key")
|
||||
cclog.Warn("Could not decode JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKey = ed25519.PublicKey(bytes)
|
||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode JWT private key")
|
||||
cclog.Warn("Could not decode JWT private key")
|
||||
return err
|
||||
}
|
||||
ja.privateKey = ed25519.PrivateKey(bytes)
|
||||
@@ -53,36 +53,35 @@ func (ja *JWTCookieSessionAuthenticator) Init() error {
|
||||
if keyFound && pubKeyCrossLogin != "" {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKeyCrossLogin)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode cross login JWT public key")
|
||||
cclog.Warn("Could not decode cross login JWT public key")
|
||||
return err
|
||||
}
|
||||
ja.publicKeyCrossLogin = ed25519.PublicKey(bytes)
|
||||
} else {
|
||||
ja.publicKeyCrossLogin = nil
|
||||
log.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
cclog.Debug("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
return errors.New("environment variable 'CROSS_LOGIN_JWT_PUBLIC_KEY' not set (cross login token based authentication will not work)")
|
||||
}
|
||||
|
||||
jc := config.Keys.JwtConfig
|
||||
// Warn if other necessary settings are not configured
|
||||
if jc != nil {
|
||||
if jc.CookieName == "" {
|
||||
log.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
if Keys.JwtConfig != nil {
|
||||
if Keys.JwtConfig.CookieName == "" {
|
||||
cclog.Info("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("cookieName for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
if !jc.ValidateUser {
|
||||
log.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
||||
if !Keys.JwtConfig.ValidateUser {
|
||||
cclog.Info("forceJWTValidationViaDatabase not set to true: CC will accept users and roles defined in JWTs regardless of its own database!")
|
||||
}
|
||||
if jc.TrustedIssuer == "" {
|
||||
log.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
if Keys.JwtConfig.TrustedIssuer == "" {
|
||||
cclog.Info("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("trustedExternalIssuer for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
} else {
|
||||
log.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
cclog.Warn("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
return errors.New("config for JWTs not configured (cross login via JWT cookie will fail)")
|
||||
}
|
||||
|
||||
log.Info("JWT Cookie Session authenticator successfully registered")
|
||||
cclog.Info("JWT Cookie Session authenticator successfully registered")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -92,7 +91,7 @@ func (ja *JWTCookieSessionAuthenticator) CanLogin(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
jc := config.Keys.JwtConfig
|
||||
jc := Keys.JwtConfig
|
||||
cookieName := ""
|
||||
if jc.CookieName != "" {
|
||||
cookieName = jc.CookieName
|
||||
@@ -115,7 +114,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, error) {
|
||||
jc := config.Keys.JwtConfig
|
||||
jc := Keys.JwtConfig
|
||||
jwtCookie, err := r.Cookie(jc.CookieName)
|
||||
var rawtoken string
|
||||
|
||||
@@ -123,7 +122,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
rawtoken = jwtCookie.Value
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method != jwt.SigningMethodEdDSA {
|
||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||
}
|
||||
@@ -140,12 +139,12 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
return ja.publicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("JWT cookie session: error while parsing token")
|
||||
cclog.Warn("JWT cookie session: error while parsing token")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
@@ -159,18 +158,18 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
var err error
|
||||
user, err = repository.GetUserRepository().GetUser(sub)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", sub)
|
||||
cclog.Errorf("Error while loading user '%v'", sub)
|
||||
}
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
cclog.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
} else {
|
||||
var name string
|
||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
||||
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||
if vals, ok := wrap["values"].([]any); ok {
|
||||
if len(vals) != 0 {
|
||||
name = fmt.Sprintf("%v", vals[0])
|
||||
|
||||
@@ -182,7 +181,7 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
}
|
||||
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
roles = append(roles, r)
|
||||
@@ -198,8 +197,8 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if jc.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if jc.SyncUserOnLogin || jc.UpdateUserOnLogin {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,10 +14,9 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -30,13 +30,13 @@ func (ja *JWTSessionAuthenticator) Init() error {
|
||||
if pubKey := os.Getenv("CROSS_LOGIN_JWT_HS512_KEY"); pubKey != "" {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
log.Warn("Could not decode cross login JWT HS512 key")
|
||||
cclog.Warn("Could not decode cross login JWT HS512 key")
|
||||
return err
|
||||
}
|
||||
ja.loginTokenKey = bytes
|
||||
}
|
||||
|
||||
log.Info("JWT Session authenticator successfully registered")
|
||||
cclog.Info("JWT Session authenticator successfully registered")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -60,19 +60,19 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
rawtoken = r.URL.Query().Get("login-token")
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (any, error) {
|
||||
if t.Method == jwt.SigningMethodHS256 || t.Method == jwt.SigningMethodHS512 {
|
||||
return ja.loginTokenKey, nil
|
||||
}
|
||||
return nil, fmt.Errorf("unkown signing method for login token: %s (known: HS256, HS512, EdDSA)", t.Method.Alg())
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing jwt token")
|
||||
cclog.Warn("Error while parsing jwt token")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !token.Valid {
|
||||
log.Warn("jwt token claims are not valid")
|
||||
cclog.Warn("jwt token claims are not valid")
|
||||
return nil, errors.New("jwt token claims are not valid")
|
||||
}
|
||||
|
||||
@@ -82,22 +82,22 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
var roles []string
|
||||
projects := make([]string, 0)
|
||||
|
||||
if config.Keys.JwtConfig.ValidateUser {
|
||||
if Keys.JwtConfig.ValidateUser {
|
||||
var err error
|
||||
user, err = repository.GetUserRepository().GetUser(sub)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%v'", sub)
|
||||
cclog.Errorf("Error while loading user '%v'", sub)
|
||||
}
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil {
|
||||
log.Warn("Could not find user from JWT in internal database.")
|
||||
cclog.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
} else {
|
||||
var name string
|
||||
if wrap, ok := claims["name"].(map[string]interface{}); ok {
|
||||
if vals, ok := wrap["values"].([]interface{}); ok {
|
||||
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||
if vals, ok := wrap["values"].([]any); ok {
|
||||
if len(vals) != 0 {
|
||||
name = fmt.Sprintf("%v", vals[0])
|
||||
|
||||
@@ -109,7 +109,7 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
}
|
||||
|
||||
// Extract roles from JWT (if present)
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
if schema.IsValidRole(r) {
|
||||
@@ -119,7 +119,7 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
}
|
||||
}
|
||||
|
||||
if rawprojs, ok := claims["projects"].([]interface{}); ok {
|
||||
if rawprojs, ok := claims["projects"].([]any); ok {
|
||||
for _, pp := range rawprojs {
|
||||
if p, ok := pp.(string); ok {
|
||||
projects = append(projects, p)
|
||||
@@ -138,8 +138,8 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -10,15 +11,27 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/go-ldap/ldap/v3"
|
||||
)
|
||||
|
||||
type LdapConfig struct {
|
||||
URL string `json:"url"`
|
||||
UserBase string `json:"user_base"`
|
||||
SearchDN string `json:"search_dn"`
|
||||
UserBind string `json:"user_bind"`
|
||||
UserFilter string `json:"user_filter"`
|
||||
UserAttr string `json:"username_attr"`
|
||||
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
|
||||
SyncDelOldUsers bool `json:"sync_del_old_users"`
|
||||
|
||||
// Should an non-existent user be added to the DB if user exists in ldap directory
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
}
|
||||
|
||||
type LdapAuthenticator struct {
|
||||
syncPassword string
|
||||
UserAttr string
|
||||
@@ -29,40 +42,11 @@ var _ Authenticator = (*LdapAuthenticator)(nil)
|
||||
func (la *LdapAuthenticator) Init() error {
|
||||
la.syncPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
|
||||
if la.syncPassword == "" {
|
||||
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
||||
cclog.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync will not work)")
|
||||
}
|
||||
|
||||
lc := config.Keys.LdapConfig
|
||||
|
||||
if lc.SyncInterval != "" {
|
||||
interval, err := time.ParseDuration(lc.SyncInterval)
|
||||
if err != nil {
|
||||
log.Warnf("Could not parse duration for sync interval: %v",
|
||||
lc.SyncInterval)
|
||||
return err
|
||||
}
|
||||
|
||||
if interval == 0 {
|
||||
log.Info("Sync interval is zero")
|
||||
return nil
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
for t := range ticker.C {
|
||||
log.Printf("sync started at %s", t.Format(time.RFC3339))
|
||||
if err := la.Sync(); err != nil {
|
||||
log.Errorf("sync failed: %s", err.Error())
|
||||
}
|
||||
log.Print("sync done")
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
log.Info("LDAP configuration key sync_interval invalid")
|
||||
}
|
||||
|
||||
if lc.UserAttr != "" {
|
||||
la.UserAttr = lc.UserAttr
|
||||
if Keys.LdapConfig.UserAttr != "" {
|
||||
la.UserAttr = Keys.LdapConfig.UserAttr
|
||||
} else {
|
||||
la.UserAttr = "gecos"
|
||||
}
|
||||
@@ -76,7 +60,7 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
lc := config.Keys.LdapConfig
|
||||
lc := Keys.LdapConfig
|
||||
|
||||
if user != nil {
|
||||
if user.AuthSource == schema.AuthViaLDAP {
|
||||
@@ -86,7 +70,7 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
if lc.SyncUserOnLogin {
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
log.Error("LDAP connection error")
|
||||
cclog.Error("LDAP connection error")
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
@@ -99,12 +83,12 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
|
||||
sr, err := l.Search(searchRequest)
|
||||
if err != nil {
|
||||
log.Warn(err)
|
||||
cclog.Warn(err)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
if len(sr.Entries) != 1 {
|
||||
log.Warn("LDAP: User does not exist or too many entries returned")
|
||||
cclog.Warn("LDAP: User does not exist or too many entries returned")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
@@ -124,7 +108,7 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
}
|
||||
|
||||
if err := repository.GetUserRepository().AddUser(user); err != nil {
|
||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
@@ -142,14 +126,14 @@ func (la *LdapAuthenticator) Login(
|
||||
) (*schema.User, error) {
|
||||
l, err := la.getLdapConnection(false)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting ldap connection")
|
||||
cclog.Warn("Error while getting ldap connection")
|
||||
return nil, err
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
userDn := strings.Replace(config.Keys.LdapConfig.UserBind, "{username}", user.Username, -1)
|
||||
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
|
||||
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
|
||||
log.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||
user.Username, err)
|
||||
return nil, fmt.Errorf("Authentication failed")
|
||||
}
|
||||
@@ -158,11 +142,11 @@ func (la *LdapAuthenticator) Login(
|
||||
}
|
||||
|
||||
func (la *LdapAuthenticator) Sync() error {
|
||||
const IN_DB int = 1
|
||||
const IN_LDAP int = 2
|
||||
const IN_BOTH int = 3
|
||||
const InDB int = 1
|
||||
const InLdap int = 2
|
||||
const InBoth int = 3
|
||||
ur := repository.GetUserRepository()
|
||||
lc := config.Keys.LdapConfig
|
||||
lc := Keys.LdapConfig
|
||||
|
||||
users := map[string]int{}
|
||||
usernames, err := ur.GetLdapUsernames()
|
||||
@@ -171,12 +155,12 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
}
|
||||
|
||||
for _, username := range usernames {
|
||||
users[username] = IN_DB
|
||||
users[username] = InDB
|
||||
}
|
||||
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
log.Error("LDAP connection error")
|
||||
cclog.Error("LDAP connection error")
|
||||
return err
|
||||
}
|
||||
defer l.Close()
|
||||
@@ -187,7 +171,7 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
lc.UserFilter,
|
||||
[]string{"dn", "uid", la.UserAttr}, nil))
|
||||
if err != nil {
|
||||
log.Warn("LDAP search error")
|
||||
cclog.Warn("LDAP search error")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -200,18 +184,18 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
|
||||
_, ok := users[username]
|
||||
if !ok {
|
||||
users[username] = IN_LDAP
|
||||
users[username] = InLdap
|
||||
newnames[username] = entry.GetAttributeValue(la.UserAttr)
|
||||
} else {
|
||||
users[username] = IN_BOTH
|
||||
users[username] = InBoth
|
||||
}
|
||||
}
|
||||
|
||||
for username, where := range users {
|
||||
if where == IN_DB && lc.SyncDelOldUsers {
|
||||
if where == InDB && lc.SyncDelOldUsers {
|
||||
ur.DelUser(username)
|
||||
log.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||
} else if where == IN_LDAP {
|
||||
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||
} else if where == InLdap {
|
||||
name := newnames[username]
|
||||
|
||||
var roles []string
|
||||
@@ -226,9 +210,9 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
AuthSource: schema.AuthViaLDAP,
|
||||
}
|
||||
|
||||
log.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
||||
cclog.Debugf("sync: add %v (name: %v, roles: [user], ldap: true)", username, name)
|
||||
if err := ur.AddUser(user); err != nil {
|
||||
log.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
return err
|
||||
}
|
||||
}
|
||||
@@ -238,17 +222,17 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
}
|
||||
|
||||
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
||||
lc := config.Keys.LdapConfig
|
||||
conn, err := ldap.DialURL(lc.Url)
|
||||
lc := Keys.LdapConfig
|
||||
conn, err := ldap.DialURL(lc.URL)
|
||||
if err != nil {
|
||||
log.Warn("LDAP URL dial failed")
|
||||
cclog.Warn("LDAP URL dial failed")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if admin {
|
||||
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
||||
conn.Close()
|
||||
log.Warn("LDAP connection bind failed")
|
||||
cclog.Warn("LDAP connection bind failed")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
)
|
||||
|
||||
@@ -27,19 +28,19 @@ func (la *LocalAuthenticator) CanLogin(
|
||||
user *schema.User,
|
||||
username string,
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request) (*schema.User, bool) {
|
||||
|
||||
r *http.Request,
|
||||
) (*schema.User, bool) {
|
||||
return user, user != nil && user.AuthSource == schema.AuthViaLocalPassword
|
||||
}
|
||||
|
||||
func (la *LocalAuthenticator) Login(
|
||||
user *schema.User,
|
||||
rw http.ResponseWriter,
|
||||
r *http.Request) (*schema.User, error) {
|
||||
|
||||
r *http.Request,
|
||||
) (*schema.User, error) {
|
||||
if e := bcrypt.CompareHashAndPassword([]byte(user.Password),
|
||||
[]byte(r.FormValue("password"))); e != nil {
|
||||
log.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
||||
cclog.Errorf("AUTH/LOCAL > Authentication for user %s failed!", user.Username)
|
||||
return nil, fmt.Errorf("Authentication failed")
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
import (
|
||||
@@ -13,15 +14,20 @@ import (
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/coreos/go-oidc/v3/oidc"
|
||||
"github.com/gorilla/mux"
|
||||
"golang.org/x/oauth2"
|
||||
)
|
||||
|
||||
type OpenIDConfig struct {
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type OIDC struct {
|
||||
client *oauth2.Config
|
||||
provider *oidc.Provider
|
||||
@@ -49,17 +55,17 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
|
||||
}
|
||||
|
||||
func NewOIDC(a *Authentication) *OIDC {
|
||||
provider, err := oidc.NewProvider(context.Background(), config.Keys.OpenIDConfig.Provider)
|
||||
provider, err := oidc.NewProvider(context.Background(), Keys.OpenIDConfig.Provider)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
clientID := os.Getenv("OID_CLIENT_ID")
|
||||
if clientID == "" {
|
||||
log.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
||||
cclog.Warn("environment variable 'OID_CLIENT_ID' not set (Open ID connect auth will not work)")
|
||||
}
|
||||
clientSecret := os.Getenv("OID_CLIENT_SECRET")
|
||||
if clientSecret == "" {
|
||||
log.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
||||
cclog.Warn("environment variable 'OID_CLIENT_SECRET' not set (Open ID connect auth will not work)")
|
||||
}
|
||||
|
||||
client := &oauth2.Config{
|
||||
@@ -168,12 +174,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
AuthSource: schema.AuthViaOIDC,
|
||||
}
|
||||
|
||||
if config.Keys.OpenIDConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if Keys.OpenIDConfig.SyncUserOnLogin || Keys.OpenIDConfig.UpdateUserOnLogin {
|
||||
handleOIDCUser(user)
|
||||
}
|
||||
|
||||
oa.authentication.SaveSession(rw, r, user)
|
||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(ctx))
|
||||
}
|
||||
|
||||
96
internal/auth/schema.go
Normal file
96
internal/auth/schema.go
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package auth
|
||||
|
||||
var configSchema = `
|
||||
{
|
||||
"jwts": {
|
||||
"description": "For JWT token authentication.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"max-age": {
|
||||
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||
"type": "string"
|
||||
},
|
||||
"cookieName": {
|
||||
"description": "Cookie that should be checked for a JWT token.",
|
||||
"type": "string"
|
||||
},
|
||||
"validateUser": {
|
||||
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"trustedIssuer": {
|
||||
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["max-age"]
|
||||
},
|
||||
"oidc": {
|
||||
"provider": {
|
||||
"description": "",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"updateUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"required": ["provider"]
|
||||
},
|
||||
"ldap": {
|
||||
"description": "For LDAP Authentication and user synchronisation.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"description": "URL of LDAP directory server.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_base": {
|
||||
"description": "Base DN of user tree root.",
|
||||
"type": "string"
|
||||
},
|
||||
"search_dn": {
|
||||
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_bind": {
|
||||
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_filter": {
|
||||
"description": "Filter to extract users for syncing.",
|
||||
"type": "string"
|
||||
},
|
||||
"username_attr": {
|
||||
"description": "Attribute with full username. Default: gecos",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_interval": {
|
||||
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_del_old_users": {
|
||||
"description": "Delete obsolete users in database.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
|
||||
},
|
||||
"required": ["jwts"]
|
||||
}`
|
||||
@@ -1,73 +1,143 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package config implements the program configuration data structures, validation and parsing
|
||||
package config
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
var Keys schema.ProgramConfig = schema.ProgramConfig{
|
||||
type ProgramConfig struct {
|
||||
// Address where the http (or https) server will listen on (for example: 'localhost:80').
|
||||
Addr string `json:"addr"`
|
||||
|
||||
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
|
||||
APIAllowedIPs []string `json:"apiAllowedIPs"`
|
||||
|
||||
// Drop root permissions once .env was read and the port was taken.
|
||||
User string `json:"user"`
|
||||
Group string `json:"group"`
|
||||
|
||||
// Disable authentication (for everything: API, Web-UI, ...)
|
||||
DisableAuthentication bool `json:"disable-authentication"`
|
||||
|
||||
// If `embed-static-files` is true (default), the frontend files are directly
|
||||
// embeded into the go binary and expected to be in web/frontend. Only if
|
||||
// it is false the files in `static-files` are served instead.
|
||||
EmbedStaticFiles bool `json:"embed-static-files"`
|
||||
StaticFiles string `json:"static-files"`
|
||||
|
||||
// 'sqlite3' or 'mysql' (mysql will work for mariadb as well)
|
||||
DBDriver string `json:"db-driver"`
|
||||
|
||||
// For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).
|
||||
DB string `json:"db"`
|
||||
|
||||
// Keep all metric data in the metric data repositories,
|
||||
// do not write to the job-archive.
|
||||
DisableArchive bool `json:"disable-archive"`
|
||||
|
||||
EnableJobTaggers bool `json:"enable-job-taggers"`
|
||||
|
||||
// Validate json input against schema
|
||||
Validate bool `json:"validate"`
|
||||
|
||||
// If 0 or empty, the session does not expire!
|
||||
SessionMaxAge string `json:"session-max-age"`
|
||||
|
||||
// If both those options are not empty, use HTTPS using those certificates.
|
||||
HTTPSCertFile string `json:"https-cert-file"`
|
||||
HTTPSKeyFile string `json:"https-key-file"`
|
||||
|
||||
// If not the empty string and `addr` does not end in ":80",
|
||||
// redirect every request incoming at port 80 to that url.
|
||||
RedirectHTTPTo string `json:"redirect-http-to"`
|
||||
|
||||
// Where to store MachineState files
|
||||
MachineStateDir string `json:"machine-state-dir"`
|
||||
|
||||
// If not zero, automatically mark jobs as stopped running X seconds longer than their walltime.
|
||||
StopJobsExceedingWalltime int `json:"stop-jobs-exceeding-walltime"`
|
||||
|
||||
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
|
||||
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
|
||||
|
||||
// Energy Mix CO2 Emission Constant [g/kWh]
|
||||
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
|
||||
EmissionConstant int `json:"emission-constant"`
|
||||
|
||||
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
|
||||
EnableResampling *ResampleConfig `json:"resampling"`
|
||||
}
|
||||
|
||||
type ResampleConfig struct {
|
||||
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
|
||||
Resolutions []int `json:"resolutions"`
|
||||
// Trigger next zoom level at less than this many visible datapoints
|
||||
Trigger int `json:"trigger"`
|
||||
}
|
||||
|
||||
type IntRange struct {
|
||||
From int `json:"from"`
|
||||
To int `json:"to"`
|
||||
}
|
||||
|
||||
type TimeRange struct {
|
||||
From *time.Time `json:"from"`
|
||||
To *time.Time `json:"to"`
|
||||
Range string `json:"range,omitempty"`
|
||||
}
|
||||
|
||||
type FilterRanges struct {
|
||||
Duration *IntRange `json:"duration"`
|
||||
NumNodes *IntRange `json:"numNodes"`
|
||||
StartTime *TimeRange `json:"startTime"`
|
||||
}
|
||||
|
||||
type ClusterConfig struct {
|
||||
Name string `json:"name"`
|
||||
FilterRanges *FilterRanges `json:"filterRanges"`
|
||||
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
|
||||
}
|
||||
|
||||
var Clusters []*ClusterConfig
|
||||
|
||||
var Keys ProgramConfig = ProgramConfig{
|
||||
Addr: "localhost:8080",
|
||||
DisableAuthentication: false,
|
||||
EmbedStaticFiles: true,
|
||||
DBDriver: "sqlite3",
|
||||
DB: "./var/job.db",
|
||||
Archive: json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`),
|
||||
DisableArchive: false,
|
||||
Validate: false,
|
||||
SessionMaxAge: "168h",
|
||||
StopJobsExceedingWalltime: 0,
|
||||
ShortRunningJobsDuration: 5 * 60,
|
||||
UiDefaults: map[string]interface{}{
|
||||
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
|
||||
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_showFootprint": true,
|
||||
"job_list_usePaging": true,
|
||||
"plot_general_colorBackground": true,
|
||||
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
|
||||
"plot_general_lineWidth": 3,
|
||||
"plot_list_jobsPerPage": 50,
|
||||
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw"},
|
||||
"plot_view_plotsPerRow": 3,
|
||||
"plot_view_showPolarplot": true,
|
||||
"plot_view_showRoofline": true,
|
||||
"plot_view_showStatTable": true,
|
||||
"system_view_selectedMetric": "cpu_load",
|
||||
"analysis_view_selectedTopEntity": "user",
|
||||
"analysis_view_selectedTopCategory": "totalWalltime",
|
||||
"status_view_selectedTopUserCategory": "totalJobs",
|
||||
"status_view_selectedTopProjectCategory": "totalJobs",
|
||||
},
|
||||
}
|
||||
|
||||
func Init(flagConfigFile string) {
|
||||
raw, err := os.ReadFile(flagConfigFile)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Fatalf("CONFIG ERROR: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
|
||||
log.Fatalf("Validate config: %v\n", err)
|
||||
}
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
log.Fatalf("could not decode: %v", err)
|
||||
}
|
||||
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
|
||||
Validate(configSchema, mainConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(mainConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
|
||||
log.Fatal("At least one cluster required in config!")
|
||||
}
|
||||
Validate(clustersSchema, clusterConfig)
|
||||
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Clusters); err != nil {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
if len(Clusters) < 1 {
|
||||
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,30 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func TestInit(t *testing.T) {
|
||||
fp := "../../configs/config.json"
|
||||
Init(fp)
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
if Keys.Addr != "0.0.0.0:443" {
|
||||
t.Errorf("wrong addr\ngot: %s \nwant: 0.0.0.0:443", Keys.Addr)
|
||||
}
|
||||
@@ -18,7 +32,17 @@ func TestInit(t *testing.T) {
|
||||
|
||||
func TestInitMinimal(t *testing.T) {
|
||||
fp := "../../configs/config-demo.json"
|
||||
Init(fp)
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
if Keys.Addr != "127.0.0.1:8080" {
|
||||
t.Errorf("wrong addr\ngot: %s \nwant: 127.0.0.1:8080", Keys.Addr)
|
||||
}
|
||||
|
||||
51
internal/config/default_metrics.go
Normal file
51
internal/config/default_metrics.go
Normal file
@@ -0,0 +1,51 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DEPRECATED: SUPERSEDED BY NEW USER CONFIG - userConfig.go / web.go
|
||||
|
||||
type DefaultMetricsCluster struct {
|
||||
Name string `json:"name"`
|
||||
DefaultMetrics string `json:"default_metrics"`
|
||||
}
|
||||
|
||||
type DefaultMetricsConfig struct {
|
||||
Clusters []DefaultMetricsCluster `json:"clusters"`
|
||||
}
|
||||
|
||||
func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) {
|
||||
filePath := "default_metrics.json"
|
||||
if _, err := os.Stat(filePath); os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg DefaultMetricsConfig
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func ParseMetricsString(s string) []string {
|
||||
parts := strings.Split(s, ",")
|
||||
var metrics []string
|
||||
for _, p := range parts {
|
||||
trimmed := strings.TrimSpace(p)
|
||||
if trimmed != "" {
|
||||
metrics = append(metrics, trimmed)
|
||||
}
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
201
internal/config/schema.go
Normal file
201
internal/config/schema.go
Normal file
@@ -0,0 +1,201 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
var configSchema = `
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"addr": {
|
||||
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||
"type": "string"
|
||||
},
|
||||
"apiAllowedIPs": {
|
||||
"description": "Addresses from which secured API endpoints can be reached",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"group": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-authentication": {
|
||||
"description": "Disable authentication (for everything: API, Web-UI, ...).",
|
||||
"type": "boolean"
|
||||
},
|
||||
"embed-static-files": {
|
||||
"description": "If all files in web/frontend/public should be served from within the binary itself (they are embedded) or not.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"static-files": {
|
||||
"description": "Folder where static assets can be found, if embed-static-files is false.",
|
||||
"type": "string"
|
||||
},
|
||||
"db": {
|
||||
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-archive": {
|
||||
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"enable-job-taggers": {
|
||||
"description": "Turn on automatic application and jobclass taggers",
|
||||
"type": "boolean"
|
||||
},
|
||||
"validate": {
|
||||
"description": "Validate all input json documents against json schema.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"session-max-age": {
|
||||
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
|
||||
"type": "string"
|
||||
},
|
||||
"https-cert-file": {
|
||||
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"https-key-file": {
|
||||
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"redirect-http-to": {
|
||||
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
|
||||
"type": "string"
|
||||
},
|
||||
"stop-jobs-exceeding-walltime": {
|
||||
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
|
||||
"type": "integer"
|
||||
},
|
||||
"short-running-jobs-duration": {
|
||||
"description": "Do not show running jobs shorter than X seconds.",
|
||||
"type": "integer"
|
||||
},
|
||||
"emission-constant": {
|
||||
"description": ".",
|
||||
"type": "integer"
|
||||
},
|
||||
"cron-frequency": {
|
||||
"description": "Frequency of cron job workers.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"duration-worker": {
|
||||
"description": "Duration Update Worker [Defaults to '5m']",
|
||||
"type": "string"
|
||||
},
|
||||
"footprint-worker": {
|
||||
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"enable-resampling": {
|
||||
"description": "Enable dynamic zoom in frontend metric plots.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"trigger": {
|
||||
"description": "Trigger next zoom level at less than this many visible datapoints.",
|
||||
"type": "integer"
|
||||
},
|
||||
"resolutions": {
|
||||
"description": "Array of resampling target resolutions, in seconds.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["trigger", "resolutions"]
|
||||
}
|
||||
},
|
||||
"required": ["apiAllowedIPs"]
|
||||
}`
|
||||
|
||||
var clustersSchema = `
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The name of the cluster.",
|
||||
"type": "string"
|
||||
},
|
||||
"metricDataRepository": {
|
||||
"description": "Type of the metric data repository for this cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["kind"]
|
||||
},
|
||||
"filterRanges": {
|
||||
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"numNodes": {
|
||||
"description": "UI slider range for number of nodes",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"duration": {
|
||||
"description": "UI slider range for duration",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "UI slider range for start time",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"to": {
|
||||
"type": "null"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
}
|
||||
},
|
||||
"required": ["numNodes", "duration", "startTime"]
|
||||
}
|
||||
},
|
||||
"required": ["name", "metricDataRepository", "filterRanges"],
|
||||
"minItems": 1
|
||||
}
|
||||
}`
|
||||
29
internal/config/validate.go
Normal file
29
internal/config/validate.go
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/santhosh-tekuri/jsonschema/v5"
|
||||
)
|
||||
|
||||
func Validate(schema string, instance json.RawMessage) {
|
||||
sch, err := jsonschema.CompileString("schema.json", schema)
|
||||
if err != nil {
|
||||
cclog.Fatalf("%#v", err)
|
||||
}
|
||||
|
||||
var v any
|
||||
if err := json.Unmarshal([]byte(instance), &v); err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
if err = sch.Validate(v); err != nil {
|
||||
cclog.Fatalf("%#v", err)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package model
|
||||
|
||||
@@ -3,12 +3,14 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type Count struct {
|
||||
@@ -16,11 +18,23 @@ type Count struct {
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
type EnergyFootprintValue struct {
|
||||
Hardware string `json:"hardware"`
|
||||
Metric string `json:"metric"`
|
||||
Value float64 `json:"value"`
|
||||
}
|
||||
|
||||
type FloatRange struct {
|
||||
From float64 `json:"from"`
|
||||
To float64 `json:"to"`
|
||||
}
|
||||
|
||||
type FootprintValue struct {
|
||||
Name string `json:"name"`
|
||||
Stat string `json:"stat"`
|
||||
Value float64 `json:"value"`
|
||||
}
|
||||
|
||||
type Footprints struct {
|
||||
TimeWeights *TimeWeights `json:"timeWeights"`
|
||||
Metrics []*MetricFootprints `json:"metrics"`
|
||||
@@ -38,6 +52,7 @@ type IntRangeOutput struct {
|
||||
|
||||
type JobFilter struct {
|
||||
Tags []string `json:"tags,omitempty"`
|
||||
DbID []string `json:"dbId,omitempty"`
|
||||
JobID *StringInput `json:"jobId,omitempty"`
|
||||
ArrayJobID *int `json:"arrayJobId,omitempty"`
|
||||
User *StringInput `json:"user,omitempty"`
|
||||
@@ -45,18 +60,16 @@ type JobFilter struct {
|
||||
JobName *StringInput `json:"jobName,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Partition *StringInput `json:"partition,omitempty"`
|
||||
Duration *schema.IntRange `json:"duration,omitempty"`
|
||||
Duration *config.IntRange `json:"duration,omitempty"`
|
||||
Energy *FloatRange `json:"energy,omitempty"`
|
||||
MinRunningFor *int `json:"minRunningFor,omitempty"`
|
||||
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
|
||||
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
|
||||
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
|
||||
StartTime *schema.TimeRange `json:"startTime,omitempty"`
|
||||
NumNodes *config.IntRange `json:"numNodes,omitempty"`
|
||||
NumAccelerators *config.IntRange `json:"numAccelerators,omitempty"`
|
||||
NumHWThreads *config.IntRange `json:"numHWThreads,omitempty"`
|
||||
StartTime *config.TimeRange `json:"startTime,omitempty"`
|
||||
State []schema.JobState `json:"state,omitempty"`
|
||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
|
||||
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
|
||||
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
|
||||
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
|
||||
Exclusive *int `json:"exclusive,omitempty"`
|
||||
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
|
||||
Shared *string `json:"shared,omitempty"`
|
||||
Node *StringInput `json:"node,omitempty"`
|
||||
}
|
||||
|
||||
@@ -85,9 +98,23 @@ type JobResultList struct {
|
||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||
}
|
||||
|
||||
type JobStats struct {
|
||||
ID int `json:"id"`
|
||||
JobID string `json:"jobId"`
|
||||
StartTime int `json:"startTime"`
|
||||
Duration int `json:"duration"`
|
||||
Cluster string `json:"cluster"`
|
||||
SubCluster string `json:"subCluster"`
|
||||
NumNodes int `json:"numNodes"`
|
||||
NumHWThreads *int `json:"numHWThreads,omitempty"`
|
||||
NumAccelerators *int `json:"numAccelerators,omitempty"`
|
||||
Stats []*NamedStats `json:"stats"`
|
||||
}
|
||||
|
||||
type JobsStatistics struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
TotalUsers int `json:"totalUsers"`
|
||||
TotalJobs int `json:"totalJobs"`
|
||||
RunningJobs int `json:"runningJobs"`
|
||||
ShortJobs int `json:"shortJobs"`
|
||||
@@ -120,20 +147,73 @@ type MetricHistoPoint struct {
|
||||
type MetricHistoPoints struct {
|
||||
Metric string `json:"metric"`
|
||||
Unit string `json:"unit"`
|
||||
Stat *string `json:"stat,omitempty"`
|
||||
Data []*MetricHistoPoint `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
type MetricStatItem struct {
|
||||
MetricName string `json:"metricName"`
|
||||
Range *FloatRange `json:"range"`
|
||||
}
|
||||
|
||||
type Mutation struct {
|
||||
}
|
||||
|
||||
type NamedStats struct {
|
||||
Name string `json:"name"`
|
||||
Data *schema.MetricStatistics `json:"data"`
|
||||
}
|
||||
|
||||
type NamedStatsWithScope struct {
|
||||
Name string `json:"name"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
Stats []*ScopedStats `json:"stats"`
|
||||
}
|
||||
|
||||
type NodeFilter struct {
|
||||
Hostname *StringInput `json:"hostname,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Subcluster *StringInput `json:"subcluster,omitempty"`
|
||||
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
|
||||
HealthState *string `json:"healthState,omitempty"`
|
||||
TimeStart *int `json:"timeStart,omitempty"`
|
||||
}
|
||||
|
||||
type NodeMetrics struct {
|
||||
Host string `json:"host"`
|
||||
SubCluster string `json:"subCluster"`
|
||||
Metrics []*JobMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type NodeStateResultList struct {
|
||||
Items []*schema.Node `json:"items"`
|
||||
Count *int `json:"count,omitempty"`
|
||||
}
|
||||
|
||||
type NodeStates struct {
|
||||
State string `json:"state"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
type NodeStatesTimed struct {
|
||||
State string `json:"state"`
|
||||
Type string `json:"type"`
|
||||
Count int `json:"count"`
|
||||
Time int `json:"time"`
|
||||
}
|
||||
|
||||
type NodesResultList struct {
|
||||
Items []*NodeMetrics `json:"items"`
|
||||
Offset *int `json:"offset,omitempty"`
|
||||
Limit *int `json:"limit,omitempty"`
|
||||
Count *int `json:"count,omitempty"`
|
||||
TotalNodes *int `json:"totalNodes,omitempty"`
|
||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||
}
|
||||
|
||||
type OrderByInput struct {
|
||||
Field string `json:"field"`
|
||||
Type string `json:"type"`
|
||||
Order SortDirectionEnum `json:"order"`
|
||||
}
|
||||
|
||||
@@ -142,7 +222,10 @@ type PageRequest struct {
|
||||
Page int `json:"page"`
|
||||
}
|
||||
|
||||
type Query struct {
|
||||
type ScopedStats struct {
|
||||
Hostname string `json:"hostname"`
|
||||
ID *string `json:"id,omitempty"`
|
||||
Data *schema.MetricStatistics `json:"data"`
|
||||
}
|
||||
|
||||
type StringInput struct {
|
||||
@@ -155,8 +238,9 @@ type StringInput struct {
|
||||
}
|
||||
|
||||
type TimeRangeOutput struct {
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
Range *string `json:"range,omitempty"`
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type TimeWeights struct {
|
||||
@@ -174,20 +258,22 @@ type User struct {
|
||||
type Aggregate string
|
||||
|
||||
const (
|
||||
AggregateUser Aggregate = "USER"
|
||||
AggregateProject Aggregate = "PROJECT"
|
||||
AggregateCluster Aggregate = "CLUSTER"
|
||||
AggregateUser Aggregate = "USER"
|
||||
AggregateProject Aggregate = "PROJECT"
|
||||
AggregateCluster Aggregate = "CLUSTER"
|
||||
AggregateSubcluster Aggregate = "SUBCLUSTER"
|
||||
)
|
||||
|
||||
var AllAggregate = []Aggregate{
|
||||
AggregateUser,
|
||||
AggregateProject,
|
||||
AggregateCluster,
|
||||
AggregateSubcluster,
|
||||
}
|
||||
|
||||
func (e Aggregate) IsValid() bool {
|
||||
switch e {
|
||||
case AggregateUser, AggregateProject, AggregateCluster:
|
||||
case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -197,7 +283,7 @@ func (e Aggregate) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
|
||||
func (e *Aggregate) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -214,11 +300,26 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *Aggregate) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e Aggregate) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
type SortByAggregate string
|
||||
|
||||
const (
|
||||
SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME"
|
||||
SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS"
|
||||
SortByAggregateTotalusers SortByAggregate = "TOTALUSERS"
|
||||
SortByAggregateTotalnodes SortByAggregate = "TOTALNODES"
|
||||
SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS"
|
||||
SortByAggregateTotalcores SortByAggregate = "TOTALCORES"
|
||||
@@ -230,6 +331,7 @@ const (
|
||||
var AllSortByAggregate = []SortByAggregate{
|
||||
SortByAggregateTotalwalltime,
|
||||
SortByAggregateTotaljobs,
|
||||
SortByAggregateTotalusers,
|
||||
SortByAggregateTotalnodes,
|
||||
SortByAggregateTotalnodehours,
|
||||
SortByAggregateTotalcores,
|
||||
@@ -240,7 +342,7 @@ var AllSortByAggregate = []SortByAggregate{
|
||||
|
||||
func (e SortByAggregate) IsValid() bool {
|
||||
switch e {
|
||||
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
||||
case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -250,7 +352,7 @@ func (e SortByAggregate) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *SortByAggregate) UnmarshalGQL(v interface{}) error {
|
||||
func (e *SortByAggregate) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -267,6 +369,20 @@ func (e SortByAggregate) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *SortByAggregate) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e SortByAggregate) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
type SortDirectionEnum string
|
||||
|
||||
const (
|
||||
@@ -291,7 +407,7 @@ func (e SortDirectionEnum) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
||||
func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
@@ -307,3 +423,17 @@ func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
|
||||
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
func (e *SortDirectionEnum) UnmarshalJSON(b []byte) error {
|
||||
s, err := strconv.Unquote(string(b))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return e.UnmarshalGQL(s)
|
||||
}
|
||||
|
||||
func (e SortDirectionEnum) MarshalJSON() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
e.MarshalGQL(&buf)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
@@ -1,15 +1,39 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// This file will not be regenerated automatically.
|
||||
//
|
||||
// It serves as dependency injection for your app, add any dependencies you require here.
|
||||
var (
|
||||
initOnce sync.Once
|
||||
resolverInstance *Resolver
|
||||
)
|
||||
|
||||
type Resolver struct {
|
||||
DB *sqlx.DB
|
||||
Repo *repository.JobRepository
|
||||
}
|
||||
|
||||
func Init() {
|
||||
initOnce.Do(func() {
|
||||
db := repository.GetConnection()
|
||||
resolverInstance = &Resolver{
|
||||
DB: db.DB, Repo: repository.GetJobRepository(),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetResolverInstance() *Resolver {
|
||||
if resolverInstance == nil {
|
||||
cclog.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return resolverInstance
|
||||
}
|
||||
|
||||
@@ -2,23 +2,26 @@ package graph
|
||||
|
||||
// This file will be automatically regenerated based on the schema, any resolver implementations
|
||||
// will be copied through when generating and any unknown code will be moved to the end.
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.45
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.81
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Partitions is the resolver for the partitions field.
|
||||
@@ -26,26 +29,93 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
|
||||
return r.Repo.Partitions(obj.Name)
|
||||
}
|
||||
|
||||
// StartTime is the resolver for the startTime field.
|
||||
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
|
||||
timestamp := time.Unix(obj.StartTime, 0)
|
||||
return ×tamp, nil
|
||||
}
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(&obj.ID)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID)
|
||||
}
|
||||
|
||||
// ConcurrentJobs is the resolver for the concurrentJobs field.
|
||||
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
|
||||
if obj.State == schema.JobStateRunning {
|
||||
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
|
||||
}
|
||||
|
||||
if obj.Exclusive != 1 && obj.Duration > 600 {
|
||||
// FIXME: Make the hardcoded duration configurable
|
||||
if obj.Shared != "none" && obj.Duration > 600 {
|
||||
return r.Repo.FindConcurrentJobs(ctx, obj)
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Footprint is the resolver for the footprint field.
|
||||
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
|
||||
rawFootprint, err := r.Repo.FetchFootprint(obj)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while fetching job footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.FootprintValue{}
|
||||
for name, value := range rawFootprint {
|
||||
|
||||
parts := strings.Split(name, "_")
|
||||
statPart := parts[len(parts)-1]
|
||||
nameParts := parts[:len(parts)-1]
|
||||
|
||||
res = append(res, &model.FootprintValue{
|
||||
Name: strings.Join(nameParts, "_"),
|
||||
Stat: statPart,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
// EnergyFootprint is the resolver for the energyFootprint field.
|
||||
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
|
||||
rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while fetching job energy footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.EnergyFootprintValue{}
|
||||
for name, value := range rawEnergyFootprint {
|
||||
// Suboptimal: Nearly hardcoded metric name expectations
|
||||
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
|
||||
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
|
||||
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
|
||||
matchCore := regexp.MustCompile(`core|Core|CORE`)
|
||||
|
||||
hwType := ""
|
||||
switch test := name; { // NOtice ';' for var declaration
|
||||
case matchCpu.MatchString(test):
|
||||
hwType = "CPU"
|
||||
case matchAcc.MatchString(test):
|
||||
hwType = "Accelerator"
|
||||
case matchMem.MatchString(test):
|
||||
hwType = "Memory"
|
||||
case matchCore.MatchString(test):
|
||||
hwType = "Core"
|
||||
default:
|
||||
hwType = "Other"
|
||||
}
|
||||
|
||||
res = append(res, &model.EnergyFootprintValue{
|
||||
Hardware: hwType,
|
||||
Metric: name,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (any, error) {
|
||||
return r.Repo.FetchMetadata(obj)
|
||||
}
|
||||
|
||||
@@ -54,41 +124,82 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use
|
||||
return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User)
|
||||
}
|
||||
|
||||
// Name is the resolver for the name field.
|
||||
func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) {
|
||||
panic(fmt.Errorf("not implemented: Name - name"))
|
||||
}
|
||||
|
||||
// CreateTag is the resolver for the createTag field.
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
|
||||
id, err := r.Repo.CreateTag(typeArg, name)
|
||||
if err != nil {
|
||||
log.Warn("Error while creating tag")
|
||||
return nil, err
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && scope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && scope == "global" ||
|
||||
user.Username == scope {
|
||||
// Create in DB
|
||||
id, err := r.Repo.CreateTag(typeArg, name, scope)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while creating tag")
|
||||
return nil, err
|
||||
}
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to create tag with scope: %s", scope)
|
||||
return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope)
|
||||
}
|
||||
}
|
||||
|
||||
// DeleteTag is the resolver for the deleteTag field.
|
||||
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
|
||||
// This Uses ID string <-> ID string, removeTagFromList uses []string <-> []int
|
||||
panic(fmt.Errorf("not implemented: DeleteTag - deleteTag"))
|
||||
}
|
||||
|
||||
// AddTagsToJob is the resolver for the addTagsToJob field.
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while adding tag to job")
|
||||
cclog.Warn("Error while adding tag to job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing tag id")
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
|
||||
log.Warn("Error while adding tag")
|
||||
return nil, err
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
|
||||
user.Username == tscope {
|
||||
// Add to Job
|
||||
if tags, err = r.Repo.AddTag(user, jid, tid); err != nil {
|
||||
cclog.Warn("Error while adding tag")
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to add tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to add tag: %d", tid)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,39 +208,123 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
|
||||
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing job id")
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing tag id")
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
|
||||
log.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
|
||||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
|
||||
user.Username == tscope {
|
||||
// Remove from Job
|
||||
if tags, err = r.Repo.RemoveTag(user, jid, tid); err != nil {
|
||||
cclog.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to remove tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
// RemoveTagFromList is the resolver for the removeTagFromList field.
|
||||
func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) {
|
||||
// Needs Contextuser
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("no user in context")
|
||||
}
|
||||
|
||||
tags := []int{}
|
||||
for _, tagId := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing tag id for removal")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Test Exists
|
||||
_, _, tscope, exists := r.Repo.TagInfo(tid)
|
||||
if !exists {
|
||||
cclog.Warnf("Tag does not exist (ID): %d", tid)
|
||||
return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
|
||||
}
|
||||
|
||||
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
|
||||
// Remove from DB
|
||||
if err = r.Repo.RemoveTagById(tid); err != nil {
|
||||
cclog.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
} else {
|
||||
tags = append(tags, int(tid))
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Not authorized to remove tag: %d", tid)
|
||||
return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
|
||||
}
|
||||
}
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
// UpdateConfiguration is the resolver for the updateConfiguration field.
|
||||
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
|
||||
if err := repository.GetUserCfgRepo().UpdateConfig(name, value, repository.GetUserFromContext(ctx)); err != nil {
|
||||
log.Warn("Error while updating user config")
|
||||
cclog.Warn("Error while updating user config")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ID is the resolver for the id field.
|
||||
func (r *nodeResolver) ID(ctx context.Context, obj *schema.Node) (string, error) {
|
||||
panic(fmt.Errorf("not implemented: ID - id"))
|
||||
}
|
||||
|
||||
// SchedulerState is the resolver for the schedulerState field.
|
||||
func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (schema.SchedulerState, error) {
|
||||
panic(fmt.Errorf("not implemented: SchedulerState - schedulerState"))
|
||||
}
|
||||
|
||||
// HealthState is the resolver for the healthState field.
|
||||
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
|
||||
panic(fmt.Errorf("not implemented: HealthState - healthState"))
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
|
||||
panic(fmt.Errorf("not implemented: MetaData - metaData"))
|
||||
}
|
||||
|
||||
// Clusters is the resolver for the clusters field.
|
||||
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) {
|
||||
return archive.Clusters, nil
|
||||
@@ -137,7 +332,12 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error)
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(nil)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), nil)
|
||||
}
|
||||
|
||||
// GlobalMetrics is the resolver for the globalMetrics field.
|
||||
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
|
||||
return archive.GlobalMetricList, nil
|
||||
}
|
||||
|
||||
// User is the resolver for the user field.
|
||||
@@ -149,7 +349,7 @@ func (r *queryResolver) User(ctx context.Context, username string) (*model.User,
|
||||
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
|
||||
data, err := r.Repo.AllocatedNodes(cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while fetching allocated nodes")
|
||||
cclog.Warn("Error while fetching allocated nodes")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -164,17 +364,81 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
|
||||
return counts, nil
|
||||
}
|
||||
|
||||
// Node is the resolver for the node field.
|
||||
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
return repo.GetNodeById(numericId, false)
|
||||
}
|
||||
|
||||
// Nodes is the resolver for the nodes field.
|
||||
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
nodes, err := repo.QueryNodes(ctx, filter, order)
|
||||
count := len(nodes)
|
||||
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
|
||||
}
|
||||
|
||||
// NodeStates is the resolver for the nodeStates field.
|
||||
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
stateCounts, serr := repo.CountNodeStates(ctx, filter)
|
||||
if serr != nil {
|
||||
cclog.Warnf("Error while counting nodeStates: %s", serr.Error())
|
||||
return nil, serr
|
||||
}
|
||||
|
||||
healthCounts, herr := repo.CountHealthStates(ctx, filter)
|
||||
if herr != nil {
|
||||
cclog.Warnf("Error while counting healthStates: %s", herr.Error())
|
||||
return nil, herr
|
||||
}
|
||||
|
||||
allCounts := make([]*model.NodeStates, 0)
|
||||
allCounts = append(stateCounts, healthCounts...)
|
||||
|
||||
return allCounts, nil
|
||||
}
|
||||
|
||||
// NodeStatesTimed is the resolver for the nodeStatesTimed field.
|
||||
func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStatesTimed, error) {
|
||||
panic(fmt.Errorf("not implemented: NodeStatesTimed - NodeStatesTimed"))
|
||||
// repo := repository.GetNodeRepository()
|
||||
|
||||
// stateCounts, serr := repo.CountNodeStates(ctx, filter)
|
||||
// if serr != nil {
|
||||
// cclog.Warnf("Error while counting nodeStates: %s", serr.Error())
|
||||
// return nil, serr
|
||||
// }
|
||||
|
||||
// healthCounts, herr := repo.CountHealthStates(ctx, filter)
|
||||
// if herr != nil {
|
||||
// cclog.Warnf("Error while counting healthStates: %s", herr.Error())
|
||||
// return nil, herr
|
||||
// }
|
||||
|
||||
// allCounts := make([]*model.NodeStates, 0)
|
||||
// allCounts = append(stateCounts, healthCounts...)
|
||||
|
||||
// return allCounts, nil
|
||||
}
|
||||
|
||||
// Job is the resolver for the job field.
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing job id")
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job, err := r.Repo.FindById(numericId)
|
||||
job, err := r.Repo.FindById(ctx, numericId)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
cclog.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -188,16 +452,26 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
|
||||
}
|
||||
|
||||
// JobMetrics is the resolver for the jobMetrics field.
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying job for metrics")
|
||||
cclog.Warn("Error while querying job for metrics")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job data")
|
||||
cclog.Warn("Error while loading job data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -215,9 +489,67 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
|
||||
return res, err
|
||||
}
|
||||
|
||||
// JobsFootprints is the resolver for the jobsFootprints field.
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
// JobStats is the resolver for the jobStats field.
|
||||
func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) {
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while querying job %s for metadata", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading jobStats data for job id %s", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.NamedStats{}
|
||||
for name, md := range data {
|
||||
res = append(res, &model.NamedStats{
|
||||
Name: name,
|
||||
Data: &md,
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
// ScopedJobStats is the resolver for the scopedJobStats field.
|
||||
func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.NamedStatsWithScope, error) {
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while querying job %s for metadata", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := make([]*model.NamedStatsWithScope, 0)
|
||||
for name, scoped := range data {
|
||||
for scope, stats := range scoped {
|
||||
|
||||
mdlStats := make([]*model.ScopedStats, 0)
|
||||
for _, stat := range stats {
|
||||
mdlStats = append(mdlStats, &model.ScopedStats{
|
||||
Hostname: stat.Hostname,
|
||||
ID: stat.Id,
|
||||
Data: stat.Data,
|
||||
})
|
||||
}
|
||||
|
||||
res = append(res, &model.NamedStatsWithScope{
|
||||
Name: name,
|
||||
Scope: scope,
|
||||
Stats: mdlStats,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// Jobs is the resolver for the jobs field.
|
||||
@@ -231,48 +563,47 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
|
||||
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying jobs")
|
||||
cclog.Warn("Error while querying jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
count, err := r.Repo.CountJobs(ctx, filter)
|
||||
if err != nil {
|
||||
log.Warn("Error while counting jobs")
|
||||
cclog.Warn("Error while counting jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !config.Keys.UiDefaults["job_list_usePaging"].(bool) {
|
||||
hasNextPage := false
|
||||
// page.Page += 1 : Simple, but expensive
|
||||
// Example Page 4 @ 10 IpP : Does item 41 exist?
|
||||
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
}
|
||||
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(nextJobs) == 1 {
|
||||
hasNextPage = true
|
||||
}
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
|
||||
} else {
|
||||
return &model.JobResultList{Items: jobs, Count: &count}, nil
|
||||
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
|
||||
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
|
||||
/*
|
||||
Example Page 4 @ 10 IpP : Does item 41 exist?
|
||||
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
|
||||
*/
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
}
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hasNextPage := len(nextJobs) == 1
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
|
||||
}
|
||||
|
||||
// JobsStatistics is the resolver for the jobsStatistics field.
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) ([]*model.JobsStatistics, error) {
|
||||
var err error
|
||||
var stats []*model.JobsStatistics
|
||||
|
||||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
|
||||
// Top Level Defaults
|
||||
defaultDurationBins := "1h"
|
||||
defaultMetricBins := 10
|
||||
|
||||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalUsers") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
|
||||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
|
||||
if groupBy == nil {
|
||||
stats, err = r.Repo.JobsStats(ctx, filter)
|
||||
@@ -305,8 +636,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histDuration") || requireField(ctx, "histNumNodes") || requireField(ctx, "histNumCores") || requireField(ctx, "histNumAccs") {
|
||||
|
||||
if numDurationBins == nil {
|
||||
numDurationBins = &defaultDurationBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0])
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0], numDurationBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -316,8 +652,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histMetrics") {
|
||||
|
||||
if numMetricBins == nil {
|
||||
numMetricBins = &defaultMetricBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0])
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0], numMetricBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -329,6 +670,62 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// JobsMetricStats is the resolver for the jobsMetricStats field.
|
||||
func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.JobStats, error) {
|
||||
// No Paging, Fixed Order by StartTime ASC
|
||||
order := &model.OrderByInput{
|
||||
Field: "startTime",
|
||||
Type: "col",
|
||||
Order: "ASC",
|
||||
}
|
||||
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, nil, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying jobs for comparison")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.JobStats{}
|
||||
for _, job := range jobs {
|
||||
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
|
||||
continue
|
||||
// return nil, err
|
||||
}
|
||||
|
||||
sres := []*model.NamedStats{}
|
||||
for name, md := range data {
|
||||
sres = append(sres, &model.NamedStats{
|
||||
Name: name,
|
||||
Data: &md,
|
||||
})
|
||||
}
|
||||
|
||||
numThreadsInt := int(job.NumHWThreads)
|
||||
numAccsInt := int(job.NumAcc)
|
||||
res = append(res, &model.JobStats{
|
||||
ID: int(*job.ID),
|
||||
JobID: strconv.Itoa(int(job.JobID)),
|
||||
StartTime: int(job.StartTime),
|
||||
Duration: int(job.Duration),
|
||||
Cluster: job.Cluster,
|
||||
SubCluster: job.SubCluster,
|
||||
NumNodes: int(job.NumNodes),
|
||||
NumHWThreads: &numThreadsInt,
|
||||
NumAccelerators: &numAccsInt,
|
||||
Stats: sres,
|
||||
})
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// JobsFootprints is the resolver for the jobsFootprints field.
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
}
|
||||
|
||||
// RooflineHeatmap is the resolver for the rooflineHeatmap field.
|
||||
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
|
||||
@@ -337,8 +734,8 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
|
||||
// NodeMetrics is the resolver for the nodeMetrics field.
|
||||
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasRole(schema.RoleAdmin) {
|
||||
return nil, errors.New("you need to be an administrator for this query")
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
@@ -347,9 +744,9 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
}
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading node data")
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -359,7 +756,10 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
Host: hostname,
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, _ = archive.GetSubClusterByNode(cluster, hostname)
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
cclog.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
@@ -377,6 +777,68 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
return nodeMetrics, nil
|
||||
}
|
||||
|
||||
// NodeMetricsList is the resolver for the nodeMetricsList field.
|
||||
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, totalNodes, hasNextPage, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, *resolution, from, to, page, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
cclog.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for scope, scopedMetric := range scopedMetrics {
|
||||
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
|
||||
Name: metric,
|
||||
Scope: scope,
|
||||
Metric: scopedMetric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
nodeMetricsList = append(nodeMetricsList, host)
|
||||
}
|
||||
|
||||
nodeMetricsListResult := &model.NodesResultList{
|
||||
Items: nodeMetricsList,
|
||||
TotalNodes: &totalNodes,
|
||||
HasNextPage: &hasNextPage,
|
||||
}
|
||||
|
||||
return nodeMetricsListResult, nil
|
||||
}
|
||||
|
||||
// NumberOfNodes is the resolver for the numberOfNodes field.
|
||||
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
|
||||
nodeList, err := archive.ParseNodeList(obj.Nodes)
|
||||
@@ -392,9 +854,15 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver
|
||||
// Job returns generated.JobResolver implementation.
|
||||
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
|
||||
|
||||
// MetricValue returns generated.MetricValueResolver implementation.
|
||||
func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} }
|
||||
|
||||
// Mutation returns generated.MutationResolver implementation.
|
||||
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
|
||||
|
||||
// Node returns generated.NodeResolver implementation.
|
||||
func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} }
|
||||
|
||||
// Query returns generated.QueryResolver implementation.
|
||||
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
|
||||
|
||||
@@ -403,6 +871,8 @@ func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subCluste
|
||||
|
||||
type clusterResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type metricValueResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type nodeResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
||||
type subClusterResolver struct{ *Resolver }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package graph
|
||||
@@ -11,10 +11,9 @@ import (
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||
@@ -24,11 +23,11 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
rows int, cols int,
|
||||
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
|
||||
minX float64, minY float64, maxX float64, maxY float64,
|
||||
) ([][]float64, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
log.Error("Error while querying jobs for roofline")
|
||||
cclog.Error("Error while querying jobs for roofline")
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
@@ -47,15 +46,22 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
continue
|
||||
}
|
||||
|
||||
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
// resolution := 0
|
||||
|
||||
// for _, mc := range metricConfigs {
|
||||
// resolution = max(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
continue
|
||||
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
@@ -63,7 +69,7 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
flops, ok1 := flops_["node"]
|
||||
membw, ok2 := membw_["node"]
|
||||
if !ok1 || !ok2 {
|
||||
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
cclog.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
continue
|
||||
// TODO/FIXME:
|
||||
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
@@ -98,7 +104,7 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
log.Error("Error while querying jobs for footprint")
|
||||
cclog.Error("Error while querying jobs for footprint")
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
@@ -120,8 +126,8 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
log.Error("Error while loading averages for footprint")
|
||||
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
cclog.Error("Error while loading averages for footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -8,15 +8,15 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
|
||||
@@ -31,7 +31,7 @@ func HandleImportFlag(flag string) error {
|
||||
|
||||
raw, err := os.ReadFile(files[0])
|
||||
if err != nil {
|
||||
log.Warn("Error while reading metadata file for import")
|
||||
cclog.Warn("Error while reading metadata file for import")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -42,15 +42,18 @@ func HandleImportFlag(flag string) error {
|
||||
}
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err = dec.Decode(&jobMeta); err != nil {
|
||||
log.Warn("Error while decoding raw json metadata for import")
|
||||
job := schema.Job{
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
}
|
||||
if err = dec.Decode(&job); err != nil {
|
||||
cclog.Warn("Error while decoding raw json metadata for import")
|
||||
return err
|
||||
}
|
||||
|
||||
raw, err = os.ReadFile(files[1])
|
||||
if err != nil {
|
||||
log.Warn("Error while reading jobdata file for import")
|
||||
cclog.Warn("Error while reading jobdata file for import")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -63,72 +66,108 @@ func HandleImportFlag(flag string) error {
|
||||
dec.DisallowUnknownFields()
|
||||
jobData := schema.JobData{}
|
||||
if err = dec.Decode(&jobData); err != nil {
|
||||
log.Warn("Error while decoding raw json jobdata for import")
|
||||
cclog.Warn("Error while decoding raw json jobdata for import")
|
||||
return err
|
||||
}
|
||||
|
||||
// checkJobData(&jobData)
|
||||
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
|
||||
// if err != nil {
|
||||
// log.Warn("Error while finding job in jobRepository")
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
|
||||
// }
|
||||
//
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
|
||||
job.Footprint = make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
statType := "avg"
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
|
||||
job.Footprint[name] = repository.LoadJobStat(&job, fp, statType)
|
||||
}
|
||||
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while marshaling job footprint")
|
||||
return err
|
||||
}
|
||||
|
||||
job.EnergyFootprint = make(map[string]float64)
|
||||
|
||||
// Total Job Energy Outside Loop
|
||||
totalEnergy := 0.0
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
// Always Init Metric Energy Inside Loop
|
||||
metricEnergy := 0.0
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
|
||||
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
|
||||
// FIXME: Needs sum as stats type
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
// Unit: (W * (s / 3600)) / 1000 = kWh
|
||||
// Round 2 Digits: round(Energy * 100) / 100
|
||||
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
|
||||
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||
rawEnergy := ((repository.LoadJobStat(&job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
|
||||
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
|
||||
}
|
||||
|
||||
job.EnergyFootprint[fp] = metricEnergy
|
||||
totalEnergy += metricEnergy
|
||||
}
|
||||
|
||||
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
|
||||
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job resources")
|
||||
cclog.Warn("Error while marshaling job resources")
|
||||
return err
|
||||
}
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job metadata")
|
||||
cclog.Warn("Error while marshaling job metadata")
|
||||
return err
|
||||
}
|
||||
|
||||
if err = SanityChecks(&job.BaseJob); err != nil {
|
||||
log.Warn("BaseJob SanityChecks failed")
|
||||
if err = SanityChecks(&job); err != nil {
|
||||
cclog.Warn("BaseJob SanityChecks failed")
|
||||
return err
|
||||
}
|
||||
|
||||
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
|
||||
log.Error("Error while importing job")
|
||||
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
|
||||
cclog.Error("Error while importing job")
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := r.InsertJob(&job)
|
||||
if err != nil {
|
||||
log.Warn("Error while job db insert")
|
||||
cclog.Warn("Error while job db insert")
|
||||
return err
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
log.Error("Error while adding or creating tag")
|
||||
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
cclog.Error("Error while adding or creating tag on import")
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||
cclog.Infof("successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer_test
|
||||
@@ -16,7 +16,8 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func copyFile(s string, d string) error {
|
||||
@@ -36,15 +37,16 @@ func copyFile(s string, d string) error {
|
||||
|
||||
func setup(t *testing.T) *repository.JobRepository {
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
@@ -75,14 +77,14 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
}
|
||||
]}`
|
||||
|
||||
log.Init("info", true)
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
|
||||
@@ -105,7 +107,19 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
config.Init(cfgFilePath)
|
||||
ccconf.Init(cfgFilePath)
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
t.Fatal("Cluster configuration must be present")
|
||||
}
|
||||
} else {
|
||||
t.Fatal("Main configuration must be present")
|
||||
}
|
||||
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
@@ -163,7 +177,7 @@ func TestHandleImportFlag(t *testing.T) {
|
||||
}
|
||||
|
||||
result := readResult(t, testname)
|
||||
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -7,13 +7,19 @@ package importer
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
|
||||
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
|
||||
)
|
||||
|
||||
// Delete the tables "job", "tag" and "jobtag" from the database and
|
||||
@@ -21,20 +27,20 @@ import (
|
||||
func InitDB() error {
|
||||
r := repository.GetJobRepository()
|
||||
if err := r.Flush(); err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
return err
|
||||
}
|
||||
starttime := time.Now()
|
||||
log.Print("Building job table...")
|
||||
cclog.Print("Building job table...")
|
||||
|
||||
t, err := r.TransactionInit()
|
||||
if err != nil {
|
||||
log.Warn("Error while initializing SQL transactions")
|
||||
cclog.Warn("Error while initializing SQL transactions")
|
||||
return err
|
||||
}
|
||||
tags := make(map[string]int64)
|
||||
|
||||
// Not using log.Print because we want the line to end with `\r` and
|
||||
// Not using cclog.Print because we want the line to end with `\r` and
|
||||
// this function is only ever called when a special command line flag
|
||||
// is passed anyways.
|
||||
fmt.Printf("%d jobs inserted...\r", 0)
|
||||
@@ -54,61 +60,114 @@ func InitDB() error {
|
||||
}
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
jobMeta.Footprint = make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
statType := "avg"
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
|
||||
jobMeta.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
|
||||
}
|
||||
|
||||
jobMeta.RawFootprint, err = json.Marshal(jobMeta.Footprint)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
cclog.Warn("Error while marshaling job footprint")
|
||||
return err
|
||||
}
|
||||
|
||||
if err := SanityChecks(&job.BaseJob); err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
jobMeta.EnergyFootprint = make(map[string]float64)
|
||||
|
||||
// Total Job Energy Outside Loop
|
||||
totalEnergy := 0.0
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
// Always Init Metric Energy Inside Loop
|
||||
metricEnergy := 0.0
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
|
||||
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
|
||||
// FIXME: Needs sum as stats type
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
// Unit: (W * (s / 3600)) / 1000 = kWh
|
||||
// Round 2 Digits: round(Energy * 100) / 100
|
||||
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
|
||||
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||
rawEnergy := ((repository.LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
|
||||
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
|
||||
}
|
||||
|
||||
jobMeta.EnergyFootprint[fp] = metricEnergy
|
||||
totalEnergy += metricEnergy
|
||||
}
|
||||
|
||||
id, err := r.TransactionAdd(t, job)
|
||||
jobMeta.Energy = (math.Round(totalEnergy*100.0) / 100.0)
|
||||
if jobMeta.RawEnergyFootprint, err = json.Marshal(jobMeta.EnergyFootprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
jobMeta.RawResources, err = json.Marshal(jobMeta.Resources)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
jobMeta.RawMetaData, err = json.Marshal(jobMeta.MetaData)
|
||||
if err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
if err := SanityChecks(jobMeta); err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
id, err := r.TransactionAddNamed(t,
|
||||
repository.NamedJobInsert, jobMeta)
|
||||
if err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
for _, tag := range jobMeta.Tags {
|
||||
tagstr := tag.Name + ":" + tag.Type
|
||||
tagId, ok := tags[tagstr]
|
||||
if !ok {
|
||||
tagId, err = r.TransactionAddTag(t, tag)
|
||||
tagId, err = r.TransactionAdd(t,
|
||||
addTagQuery,
|
||||
tag.Name, tag.Type)
|
||||
if err != nil {
|
||||
log.Errorf("Error adding tag: %v", err)
|
||||
cclog.Errorf("Error adding tag: %v", err)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
tags[tagstr] = tagId
|
||||
}
|
||||
|
||||
r.TransactionSetTag(t, id, tagId)
|
||||
r.TransactionAdd(t,
|
||||
setTagQuery,
|
||||
id, tagId)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
@@ -117,21 +176,21 @@ func InitDB() error {
|
||||
}
|
||||
|
||||
if errorOccured > 0 {
|
||||
log.Warnf("Error in import of %d jobs!", errorOccured)
|
||||
cclog.Warnf("Error in import of %d jobs!", errorOccured)
|
||||
}
|
||||
|
||||
r.TransactionEnd(t)
|
||||
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
|
||||
cclog.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
// This function also sets the subcluster if necessary!
|
||||
func SanityChecks(job *schema.BaseJob) error {
|
||||
func SanityChecks(job *schema.Job) error {
|
||||
if c := archive.GetCluster(job.Cluster); c == nil {
|
||||
return fmt.Errorf("no such cluster: %v", job.Cluster)
|
||||
}
|
||||
if err := archive.AssignSubCluster(job); err != nil {
|
||||
log.Warn("Error while assigning subcluster to job")
|
||||
cclog.Warn("Error while assigning subcluster to job")
|
||||
return err
|
||||
}
|
||||
if !job.State.Valid() {
|
||||
@@ -150,18 +209,6 @@ func SanityChecks(job *schema.BaseJob) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
if metric == "mem_used" {
|
||||
return stats.Max
|
||||
} else {
|
||||
return stats.Avg
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0
|
||||
}
|
||||
|
||||
func checkJobData(d *schema.JobData) error {
|
||||
for _, scopes := range *d {
|
||||
// var newUnit schema.Unit
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -7,7 +7,7 @@ package importer
|
||||
import (
|
||||
"math"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-units"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
)
|
||||
|
||||
func getNormalizationFactor(v float64) (float64, int) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package importer
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-units"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
)
|
||||
|
||||
func TestNormalizeFactor(t *testing.T) {
|
||||
|
||||
1486
internal/importer/testdata/cluster-fritz.json
vendored
1486
internal/importer/testdata/cluster-fritz.json
vendored
File diff suppressed because it is too large
Load Diff
@@ -1 +1 @@
|
||||
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
|
||||
{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
|
||||
{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}}
|
||||
|
||||
217
internal/memorystore/api.go
Normal file
217
internal/memorystore/api.go
Normal file
@@ -0,0 +1,217 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
type APIMetricData struct {
|
||||
Error *string `json:"error,omitempty"`
|
||||
Data schema.FloatArray `json:"data,omitempty"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
}
|
||||
|
||||
type APIQueryRequest struct {
|
||||
Cluster string `json:"cluster"`
|
||||
Queries []APIQuery `json:"queries"`
|
||||
ForAllNodes []string `json:"for-all-nodes"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
WithStats bool `json:"with-stats"`
|
||||
WithData bool `json:"with-data"`
|
||||
WithPadding bool `json:"with-padding"`
|
||||
}
|
||||
|
||||
type APIQueryResponse struct {
|
||||
Queries []APIQuery `json:"queries,omitempty"`
|
||||
Results [][]APIMetricData `json:"results"`
|
||||
}
|
||||
|
||||
type APIQuery struct {
|
||||
Type *string `json:"type,omitempty"`
|
||||
SubType *string `json:"subtype,omitempty"`
|
||||
Metric string `json:"metric"`
|
||||
Hostname string `json:"host"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
TypeIds []string `json:"type-ids,omitempty"`
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||
ScaleFactor schema.Float `json:"scale-by,omitempty"`
|
||||
Aggregate bool `json:"aggreg"`
|
||||
}
|
||||
|
||||
// TODO: Optimize this, just like the stats endpoint!
|
||||
func (data *APIMetricData) AddStats() {
|
||||
n := 0
|
||||
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
|
||||
for _, x := range data.Data {
|
||||
if x.IsNaN() {
|
||||
continue
|
||||
}
|
||||
|
||||
n += 1
|
||||
sum += float64(x)
|
||||
min = math.Min(min, float64(x))
|
||||
max = math.Max(max, float64(x))
|
||||
}
|
||||
|
||||
if n > 0 {
|
||||
avg := sum / float64(n)
|
||||
data.Avg = schema.Float(avg)
|
||||
data.Min = schema.Float(min)
|
||||
data.Max = schema.Float(max)
|
||||
} else {
|
||||
data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) ScaleBy(f schema.Float) {
|
||||
if f == 0 || f == 1 {
|
||||
return
|
||||
}
|
||||
|
||||
data.Avg *= f
|
||||
data.Min *= f
|
||||
data.Max *= f
|
||||
for i := 0; i < len(data.Data); i++ {
|
||||
data.Data[i] *= f
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
|
||||
minfo, ok := ms.Metrics[metric]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
|
||||
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
|
||||
ndata := make([]schema.Float, 0, padfront+len(data.Data))
|
||||
for range padfront {
|
||||
ndata = append(ndata, schema.NaN)
|
||||
}
|
||||
for j := 0; j < len(data.Data); j++ {
|
||||
ndata = append(ndata, data.Data[j])
|
||||
}
|
||||
data.Data = ndata
|
||||
}
|
||||
}
|
||||
|
||||
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
|
||||
req.WithData = true
|
||||
req.WithData = true
|
||||
req.WithData = true
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
response := APIQueryResponse{
|
||||
Results: make([][]APIMetricData, 0, len(req.Queries)),
|
||||
}
|
||||
if req.ForAllNodes != nil {
|
||||
nodes := ms.ListChildren([]string{req.Cluster})
|
||||
for _, node := range nodes {
|
||||
for _, metric := range req.ForAllNodes {
|
||||
q := APIQuery{
|
||||
Metric: metric,
|
||||
Hostname: node,
|
||||
}
|
||||
req.Queries = append(req.Queries, q)
|
||||
response.Queries = append(response.Queries, q)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, query := range req.Queries {
|
||||
sels := make([]util.Selector, 0, 1)
|
||||
if query.Aggregate || query.Type == nil {
|
||||
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
|
||||
if query.Type != nil {
|
||||
if len(query.TypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.TypeIds))
|
||||
for i, id := range query.TypeIds {
|
||||
ids[i] = *query.Type + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
|
||||
if query.SubType != nil {
|
||||
if len(query.SubTypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.SubTypeIds))
|
||||
for i, id := range query.SubTypeIds {
|
||||
ids[i] = *query.SubType + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
}
|
||||
}
|
||||
sels = append(sels, sel)
|
||||
} else {
|
||||
for _, typeID := range query.TypeIds {
|
||||
if query.SubType != nil {
|
||||
for _, subTypeID := range query.SubTypeIds {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
{String: *query.SubType + subTypeID},
|
||||
})
|
||||
}
|
||||
} else {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// log.Printf("query: %#v\n", query)
|
||||
// log.Printf("sels: %#v\n", sels)
|
||||
var err error
|
||||
res := make([]APIMetricData, 0, len(sels))
|
||||
for _, sel := range sels {
|
||||
data := APIMetricData{}
|
||||
|
||||
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
|
||||
if err != nil {
|
||||
msg := err.Error()
|
||||
data.Error = &msg
|
||||
res = append(res, data)
|
||||
continue
|
||||
}
|
||||
|
||||
if req.WithStats {
|
||||
data.AddStats()
|
||||
}
|
||||
if query.ScaleFactor != 0 {
|
||||
data.ScaleBy(query.ScaleFactor)
|
||||
}
|
||||
if req.WithPadding {
|
||||
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
|
||||
}
|
||||
if !req.WithData {
|
||||
data.Data = nil
|
||||
}
|
||||
res = append(res, data)
|
||||
}
|
||||
response.Results = append(response.Results, res)
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
191
internal/memorystore/archive.go
Normal file
191
internal/memorystore/archive.go
Normal file
@@ -0,0 +1,191 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Archive.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Printf("[METRICSTORE]> start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339))
|
||||
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
|
||||
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
|
||||
|
||||
if err != nil {
|
||||
cclog.Printf("[METRICSTORE]> archiving failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Printf("[METRICSTORE]> done: %d files zipped and moved to archive\n", n)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var ErrNoNewArchiveData error = errors.New("all data already archived")
|
||||
|
||||
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
|
||||
// deleting them from the `checkpointsDir`.
|
||||
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries1, err := os.ReadDir(checkpointsDir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
type workItem struct {
|
||||
cdir, adir string
|
||||
cluster, host string
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
n, errs := int32(0), int32(0)
|
||||
work := make(chan workItem, NumWorkers)
|
||||
|
||||
wg.Add(NumWorkers)
|
||||
for worker := 0; worker < NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for workItem := range work {
|
||||
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(m))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for _, de1 := range entries1 {
|
||||
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
}
|
||||
|
||||
for _, de2 := range entries2 {
|
||||
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
|
||||
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
|
||||
work <- workItem{
|
||||
adir: adir, cdir: cdir,
|
||||
cluster: de1.Name(), host: de2.Name(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happend while archiving (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Helper function for `ArchiveCheckpoints`.
|
||||
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
extension := Keys.Checkpoints.FileFormat
|
||||
files, err := findFiles(entries, from, extension, false)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if deleteInstead {
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
|
||||
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(archiveDir, 0o755)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
bw := bufio.NewWriter(f)
|
||||
defer bw.Flush()
|
||||
zw := zip.NewWriter(bw)
|
||||
defer zw.Close()
|
||||
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
r, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
w, err := zw.Create(checkpoint)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
if _, err = io.Copy(w, r); err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
482
internal/memorystore/avroCheckpoint.go
Normal file
482
internal/memorystore/avroCheckpoint.go
Normal file
@@ -0,0 +1,482 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
var NumAvroWorkers int = 4
|
||||
var startUp bool = true
|
||||
var ErrNoNewData error = errors.New("no data in the pool")
|
||||
|
||||
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
|
||||
levels := make([]*AvroLevel, 0)
|
||||
selectors := make([][]string, 0)
|
||||
as.root.lock.RLock()
|
||||
// Cluster
|
||||
for sel1, l1 := range as.root.children {
|
||||
l1.lock.RLock()
|
||||
// Node
|
||||
for sel2, l2 := range l1.children {
|
||||
l2.lock.RLock()
|
||||
// Frequency
|
||||
for sel3, l3 := range l2.children {
|
||||
levels = append(levels, l3)
|
||||
selectors = append(selectors, []string{sel1, sel2, sel3})
|
||||
}
|
||||
l2.lock.RUnlock()
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
as.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *AvroLevel
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(NumAvroWorkers)
|
||||
work := make(chan workItem, NumAvroWorkers*2)
|
||||
for range NumAvroWorkers {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
from := getTimestamp(workItem.dir)
|
||||
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := range len(levels) {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
|
||||
startUp = false
|
||||
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// getTimestamp returns the timestamp from the directory name
|
||||
func getTimestamp(dir string) int64 {
|
||||
// Extract the resolution and timestamp from the directory name
|
||||
// The existing avro file will be in epoch timestamp format
|
||||
// iterate over all the files in the directory and find the maximum timestamp
|
||||
// and return it
|
||||
|
||||
resolution := path.Base(dir)
|
||||
dir = path.Dir(dir)
|
||||
|
||||
files, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var maxTS int64 = 0
|
||||
|
||||
if len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if file.IsDir() {
|
||||
continue
|
||||
}
|
||||
name := file.Name()
|
||||
|
||||
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
|
||||
if err != nil {
|
||||
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
if ts > maxTS {
|
||||
maxTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
|
||||
|
||||
if startUp {
|
||||
return 0
|
||||
}
|
||||
|
||||
if updateTime < time.Now().Unix() {
|
||||
return 0
|
||||
}
|
||||
|
||||
return maxTS
|
||||
}
|
||||
|
||||
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
// fmt.Printf("Checkpointing directory: %s\n", dir)
|
||||
// filepath contains the resolution
|
||||
intRes, _ := strconv.Atoi(path.Base(dir))
|
||||
|
||||
// find smallest overall timestamp in l.data map and delete it from l.data
|
||||
minTS := int64(1<<63 - 1)
|
||||
for ts, dat := range l.data {
|
||||
if ts < minTS && len(dat) != 0 {
|
||||
minTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
if from == 0 && minTS != int64(1<<63-1) {
|
||||
from = minTS
|
||||
}
|
||||
|
||||
if from == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
var schema string
|
||||
var codec *goavro.Codec
|
||||
recordList := make([]map[string]any, 0)
|
||||
|
||||
var f *os.File
|
||||
|
||||
filePath := dir + fmt.Sprintf("_%d.avro", from)
|
||||
|
||||
var err error
|
||||
|
||||
fp_, err_ := os.Stat(filePath)
|
||||
if errors.Is(err_, os.ErrNotExist) {
|
||||
err = os.MkdirAll(path.Dir(dir), 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create directory: %v", err)
|
||||
}
|
||||
} else if fp_.Size() != 0 {
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open existing avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
reader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader: %v", err)
|
||||
}
|
||||
codec = reader.Codec()
|
||||
schema = codec.Schema()
|
||||
|
||||
f.Close()
|
||||
}
|
||||
|
||||
timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
|
||||
|
||||
if dumpAll {
|
||||
timeRef = time.Now().Unix()
|
||||
}
|
||||
|
||||
// Empty values
|
||||
if len(l.data) == 0 {
|
||||
// we checkpoint avro files every 60 seconds
|
||||
repeat := 60 / intRes
|
||||
|
||||
for range repeat {
|
||||
recordList = append(recordList, make(map[string]any))
|
||||
}
|
||||
}
|
||||
|
||||
readFlag := true
|
||||
|
||||
for ts := range l.data {
|
||||
flag := false
|
||||
if ts < timeRef {
|
||||
data := l.data[ts]
|
||||
|
||||
schemaGen, err := generateSchema(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
flag, schema, err = compareSchema(schema, schemaGen)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compare read and generated schema: %v", err)
|
||||
}
|
||||
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
|
||||
|
||||
f.Close()
|
||||
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open Avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
|
||||
}
|
||||
|
||||
for ocfReader.Scan() {
|
||||
record, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read record: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, record.(map[string]any))
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
err = os.Remove(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete file: %v", err)
|
||||
}
|
||||
|
||||
readFlag = false
|
||||
}
|
||||
codec, err = goavro.NewCodec(schema)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create codec after merged schema: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, generateRecord(data))
|
||||
delete(l.data, ts)
|
||||
}
|
||||
}
|
||||
|
||||
if len(recordList) == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to append new avro file: %v", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("Codec : %#v\n", codec)
|
||||
|
||||
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
|
||||
W: f,
|
||||
Codec: codec,
|
||||
CompressionName: goavro.CompressionDeflateLabel,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF writer: %v", err)
|
||||
}
|
||||
|
||||
// Append the new record
|
||||
if err := writer.Append(recordList); err != nil {
|
||||
return fmt.Errorf("failed to append record: %v", err)
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
|
||||
var genSchema, readSchema AvroSchema
|
||||
|
||||
if schemaRead == "" {
|
||||
return false, schemaGen, nil
|
||||
}
|
||||
|
||||
// Unmarshal the schema strings into AvroSchema structs
|
||||
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
|
||||
}
|
||||
|
||||
sort.Slice(genSchema.Fields, func(i, j int) bool {
|
||||
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
sort.Slice(readSchema.Fields, func(i, j int) bool {
|
||||
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual := true
|
||||
if len(genSchema.Fields) <= len(readSchema.Fields) {
|
||||
|
||||
for i := range genSchema.Fields {
|
||||
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// If schemas are identical, return the read schema
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Create a map to hold unique fields from both schemas
|
||||
fieldMap := make(map[string]AvroField)
|
||||
|
||||
// Add fields from the read schema
|
||||
for _, field := range readSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Add or update fields from the generated schema
|
||||
for _, field := range genSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Create a union schema by collecting fields from the map
|
||||
var mergedFields []AvroField
|
||||
for _, field := range fieldMap {
|
||||
mergedFields = append(mergedFields, field)
|
||||
}
|
||||
|
||||
// Sort fields by name for consistency
|
||||
sort.Slice(mergedFields, func(i, j int) bool {
|
||||
return mergedFields[i].Name < mergedFields[j].Name
|
||||
})
|
||||
|
||||
// Create the merged schema
|
||||
mergedSchema := AvroSchema{
|
||||
Type: "record",
|
||||
Name: genSchema.Name,
|
||||
Fields: mergedFields,
|
||||
}
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
|
||||
if schemasEqual {
|
||||
for i := range mergedSchema.Fields {
|
||||
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Marshal the merged schema back to JSON
|
||||
mergedSchemaJSON, err := json.Marshal(mergedSchema)
|
||||
if err != nil {
|
||||
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
|
||||
}
|
||||
|
||||
return true, string(mergedSchemaJSON), nil
|
||||
}
|
||||
|
||||
func generateSchema(data map[string]schema.Float) (string, error) {
|
||||
// Define the Avro schema structure
|
||||
schema := map[string]any{
|
||||
"type": "record",
|
||||
"name": "DataRecord",
|
||||
"fields": []map[string]any{},
|
||||
}
|
||||
|
||||
fieldTracker := make(map[string]struct{})
|
||||
|
||||
for key := range data {
|
||||
if _, exists := fieldTracker[key]; !exists {
|
||||
key = correctKey(key)
|
||||
|
||||
field := map[string]any{
|
||||
"name": key,
|
||||
"type": "double",
|
||||
"default": -1.0,
|
||||
}
|
||||
schema["fields"] = append(schema["fields"].([]map[string]any), field)
|
||||
fieldTracker[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
schemaString, err := json.Marshal(schema)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to marshal schema: %v", err)
|
||||
}
|
||||
|
||||
return string(schemaString), nil
|
||||
}
|
||||
|
||||
func generateRecord(data map[string]schema.Float) map[string]any {
|
||||
record := make(map[string]any)
|
||||
|
||||
// Iterate through each map in data
|
||||
for key, value := range data {
|
||||
key = correctKey(key)
|
||||
|
||||
// Set the value in the record
|
||||
// avro only accepts basic types
|
||||
record[key] = value.Double()
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func correctKey(key string) string {
|
||||
// Replace any invalid characters in the key
|
||||
// For example, replace spaces with underscores
|
||||
key = strings.ReplaceAll(key, ":", "___")
|
||||
key = strings.ReplaceAll(key, ".", "__")
|
||||
|
||||
return key
|
||||
}
|
||||
|
||||
func ReplaceKey(key string) string {
|
||||
// Replace any invalid characters in the key
|
||||
// For example, replace spaces with underscores
|
||||
key = strings.ReplaceAll(key, "___", ":")
|
||||
key = strings.ReplaceAll(key, "__", ".")
|
||||
|
||||
return key
|
||||
}
|
||||
84
internal/memorystore/avroHelper.go
Normal file
84
internal/memorystore/avroHelper.go
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"slices"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
// AvroPool is a pool of Avro writers.
|
||||
go func() {
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
wg.Done() // Mark this goroutine as done
|
||||
return // Exit the goroutine
|
||||
}
|
||||
|
||||
defer wg.Done()
|
||||
|
||||
var avroLevel *AvroLevel
|
||||
oldSelector := make([]string, 0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case val := <-LineProtocolMessages:
|
||||
// Fetch the frequency of the metric from the global configuration
|
||||
freq, err := GetMetricFrequency(val.MetricName)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error fetching metric frequency: %s\n", err)
|
||||
continue
|
||||
}
|
||||
|
||||
metricName := ""
|
||||
|
||||
for _, selectorName := range val.Selector {
|
||||
metricName += selectorName + Delimiter
|
||||
}
|
||||
|
||||
metricName += val.MetricName
|
||||
|
||||
// Create a new selector for the Avro level
|
||||
// The selector is a slice of strings that represents the path to the
|
||||
// Avro level. It is created by appending the cluster, node, and metric
|
||||
// name to the selector.
|
||||
var selector []string
|
||||
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
|
||||
|
||||
if !testEq(oldSelector, selector) {
|
||||
// Get the Avro level for the metric
|
||||
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
|
||||
|
||||
// If the Avro level is nil, create a new one
|
||||
if avroLevel == nil {
|
||||
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
|
||||
}
|
||||
oldSelector = slices.Clone(selector)
|
||||
}
|
||||
|
||||
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func testEq(a, b []string) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if a[i] != b[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
168
internal/memorystore/avroStruct.go
Normal file
168
internal/memorystore/avroStruct.go
Normal file
@@ -0,0 +1,168 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
LineProtocolMessages = make(chan *AvroStruct)
|
||||
Delimiter = "ZZZZZ"
|
||||
)
|
||||
|
||||
// CheckpointBufferMinutes should always be in minutes.
|
||||
// Its controls the amount of data to hold for given amount of time.
|
||||
var CheckpointBufferMinutes = 3
|
||||
|
||||
type AvroStruct struct {
|
||||
MetricName string
|
||||
Cluster string
|
||||
Node string
|
||||
Selector []string
|
||||
Value schema.Float
|
||||
Timestamp int64
|
||||
}
|
||||
|
||||
type AvroStore struct {
|
||||
root AvroLevel
|
||||
}
|
||||
|
||||
var avroStore AvroStore
|
||||
|
||||
type AvroLevel struct {
|
||||
children map[string]*AvroLevel
|
||||
data map[int64]map[string]schema.Float
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
type AvroField struct {
|
||||
Name string `json:"name"`
|
||||
Type any `json:"type"`
|
||||
Default any `json:"default,omitempty"`
|
||||
}
|
||||
|
||||
type AvroSchema struct {
|
||||
Type string `json:"type"`
|
||||
Name string `json:"name"`
|
||||
Fields []AvroField `json:"fields"`
|
||||
}
|
||||
|
||||
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *AvroLevel
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok := l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unqiue access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
child = &AvroLevel{
|
||||
data: make(map[int64]map[string]schema.Float, 0),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*AvroLevel{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
|
||||
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
|
||||
|
||||
// Create keys in advance for the given amount of time
|
||||
if len(l.data) != KeyCounter {
|
||||
if len(l.data) == 0 {
|
||||
for i := range KeyCounter {
|
||||
l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
} else {
|
||||
// Get the last timestamp
|
||||
var lastTS int64
|
||||
for ts := range l.data {
|
||||
if ts > lastTS {
|
||||
lastTS = ts
|
||||
}
|
||||
}
|
||||
// Create keys for the next KeyCounter timestamps
|
||||
l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
}
|
||||
|
||||
closestTS := int64(0)
|
||||
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
|
||||
found := false
|
||||
|
||||
// Iterate over timestamps and choose the one which is within range.
|
||||
// Since its epoch time, we check if the difference is less than 60 seconds.
|
||||
for ts, dat := range l.data {
|
||||
// Check if timestamp is within range
|
||||
diff := timestamp - ts
|
||||
if diff < -int64(Freq) || diff > int64(Freq) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Metric already present at this timestamp — skip
|
||||
if _, ok := dat[metricName]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if this is the closest timestamp so far
|
||||
if Abs(diff) < minDiff {
|
||||
minDiff = Abs(diff)
|
||||
closestTS = ts
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
l.data[closestTS][metricName] = value
|
||||
}
|
||||
}
|
||||
|
||||
func GetAvroStore() *AvroStore {
|
||||
return &avroStore
|
||||
}
|
||||
|
||||
// Abs returns the absolute value of x.
|
||||
func Abs(x int64) int64 {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
238
internal/memorystore/buffer.go
Normal file
238
internal/memorystore/buffer.go
Normal file
@@ -0,0 +1,238 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Default buffer capacity.
|
||||
// `buffer.data` will only ever grow up to it's capacity and a new link
|
||||
// in the buffer chain will be created if needed so that no copying
|
||||
// of data or reallocation needs to happen on writes.
|
||||
const (
|
||||
BufferCap int = 512
|
||||
)
|
||||
|
||||
// So that we can reuse allocations
|
||||
var bufferPool sync.Pool = sync.Pool{
|
||||
New: func() any {
|
||||
return &buffer{
|
||||
data: make([]schema.Float, 0, BufferCap),
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
var (
|
||||
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
|
||||
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
|
||||
)
|
||||
|
||||
// Each metric on each level has it's own buffer.
|
||||
// This is where the actual values go.
|
||||
// If `cap(data)` is reached, a new buffer is created and
|
||||
// becomes the new head of a buffer list.
|
||||
type buffer struct {
|
||||
prev *buffer
|
||||
next *buffer
|
||||
data []schema.Float
|
||||
frequency int64
|
||||
start int64
|
||||
archived bool
|
||||
closed bool
|
||||
}
|
||||
|
||||
func newBuffer(ts, freq int64) *buffer {
|
||||
b := bufferPool.Get().(*buffer)
|
||||
b.frequency = freq
|
||||
b.start = ts - (freq / 2)
|
||||
b.prev = nil
|
||||
b.next = nil
|
||||
b.archived = false
|
||||
b.closed = false
|
||||
b.data = b.data[:0]
|
||||
return b
|
||||
}
|
||||
|
||||
// If a new buffer was created, the new head is returnd.
|
||||
// Otherwise, the existing buffer is returnd.
|
||||
// Normaly, only "newer" data should be written, but if the value would
|
||||
// end up in the same buffer anyways it is allowed.
|
||||
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
|
||||
if ts < b.start {
|
||||
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
|
||||
}
|
||||
|
||||
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
|
||||
idx := int((ts - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
newbuf := newBuffer(ts, b.frequency)
|
||||
newbuf.prev = b
|
||||
b.next = newbuf
|
||||
b.close()
|
||||
b = newbuf
|
||||
idx = 0
|
||||
}
|
||||
|
||||
// Overwriting value or writing value from past
|
||||
if idx < len(b.data) {
|
||||
b.data[idx] = value
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// Fill up unwritten slots with NaN
|
||||
for i := len(b.data); i < idx; i++ {
|
||||
b.data = append(b.data, schema.NaN)
|
||||
}
|
||||
|
||||
b.data = append(b.data, value)
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (b *buffer) end() int64 {
|
||||
return b.firstWrite() + int64(len(b.data))*b.frequency
|
||||
}
|
||||
|
||||
func (b *buffer) firstWrite() int64 {
|
||||
return b.start + (b.frequency / 2)
|
||||
}
|
||||
|
||||
func (b *buffer) close() {}
|
||||
|
||||
/*
|
||||
func (b *buffer) close() {
|
||||
if b.closed {
|
||||
return
|
||||
}
|
||||
|
||||
b.closed = true
|
||||
n, sum, min, max := 0, 0., math.MaxFloat64, -math.MaxFloat64
|
||||
for _, x := range b.data {
|
||||
if x.IsNaN() {
|
||||
continue
|
||||
}
|
||||
|
||||
n += 1
|
||||
f := float64(x)
|
||||
sum += f
|
||||
min = math.Min(min, f)
|
||||
max = math.Max(max, f)
|
||||
}
|
||||
|
||||
b.statisticts.samples = n
|
||||
if n > 0 {
|
||||
b.statisticts.avg = Float(sum / float64(n))
|
||||
b.statisticts.min = Float(min)
|
||||
b.statisticts.max = Float(max)
|
||||
} else {
|
||||
b.statisticts.avg = NaN
|
||||
b.statisticts.min = NaN
|
||||
b.statisticts.max = NaN
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// func interpolate(idx int, data []Float) Float {
|
||||
// if idx == 0 || idx+1 == len(data) {
|
||||
// return NaN
|
||||
// }
|
||||
// return (data[idx-1] + data[idx+1]) / 2.0
|
||||
// }
|
||||
|
||||
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
|
||||
// Simple linear interpolation is done between the two neighboring cells if possible.
|
||||
// If values at the start or end are missing, instead of NaN values, the second and thrid
|
||||
// return values contain the actual `from`/`to`.
|
||||
// This function goes back the buffer chain if `from` is older than the currents buffer start.
|
||||
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
|
||||
// If `data` is not long enough to hold all values, this function will panic!
|
||||
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
|
||||
if from < b.firstWrite() {
|
||||
if b.prev != nil {
|
||||
return b.prev.read(from, to, data)
|
||||
}
|
||||
from = b.firstWrite()
|
||||
}
|
||||
|
||||
i := 0
|
||||
t := from
|
||||
for ; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
if b.next == nil {
|
||||
break
|
||||
}
|
||||
b = b.next
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if idx >= len(b.data) {
|
||||
if b.next == nil || to <= b.next.start {
|
||||
break
|
||||
}
|
||||
data[i] += schema.NaN
|
||||
} else if t < b.start {
|
||||
data[i] += schema.NaN
|
||||
// } else if b.data[idx].IsNaN() {
|
||||
// data[i] += interpolate(idx, b.data)
|
||||
} else {
|
||||
data[i] += b.data[idx]
|
||||
}
|
||||
i++
|
||||
}
|
||||
|
||||
return data[:i], from, t, nil
|
||||
}
|
||||
|
||||
// Returns true if this buffer needs to be freed.
|
||||
func (b *buffer) free(t int64) (delme bool, n int) {
|
||||
if b.prev != nil {
|
||||
delme, m := b.prev.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
b.prev.next = nil
|
||||
if cap(b.prev.data) == BufferCap {
|
||||
bufferPool.Put(b.prev)
|
||||
}
|
||||
b.prev = nil
|
||||
}
|
||||
}
|
||||
|
||||
end := b.end()
|
||||
if end < t {
|
||||
return true, n + 1
|
||||
}
|
||||
|
||||
return false, n
|
||||
}
|
||||
|
||||
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
|
||||
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
|
||||
if b == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := b.prev.iterFromTo(from, to, callback); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if from <= b.end() && b.start <= to {
|
||||
return callback(b)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *buffer) count() int64 {
|
||||
res := int64(len(b.data))
|
||||
if b.prev != nil {
|
||||
res += b.prev.count()
|
||||
}
|
||||
return res
|
||||
}
|
||||
768
internal/memorystore/checkpoint.go
Normal file
768
internal/memorystore/checkpoint.go
Normal file
@@ -0,0 +1,768 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
// Whenever changed, update MarshalJSON as well!
|
||||
type CheckpointMetrics struct {
|
||||
Data []schema.Float `json:"data"`
|
||||
Frequency int64 `json:"frequency"`
|
||||
Start int64 `json:"start"`
|
||||
}
|
||||
|
||||
type CheckpointFile struct {
|
||||
Metrics map[string]*CheckpointMetrics `json:"metrics"`
|
||||
Children map[string]*CheckpointFile `json:"children"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
}
|
||||
|
||||
var lastCheckpoint time.Time
|
||||
|
||||
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
lastCheckpoint = time.Now()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
cclog.Printf("[METRICSTORE]> start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339))
|
||||
now := time.Now()
|
||||
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
|
||||
lastCheckpoint.Unix(), now.Unix())
|
||||
if err != nil {
|
||||
cclog.Printf("[METRICSTORE]> checkpointing failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Printf("[METRICSTORE]> done: %d checkpoint files created\n", n)
|
||||
lastCheckpoint = now
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, _ := time.ParseDuration("1m")
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
|
||||
// This is the first tick untill we collect the data for given minutes.
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
// log.Printf("Checkpointing %d avro files", count)
|
||||
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
// Regular ticks of 1 minute to write data.
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
// log.Printf("Checkpointing %d avro files", count)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// As `Float` implements a custom MarshalJSON() function,
|
||||
// serializing an array of such types has more overhead
|
||||
// than one would assume (because of extra allocations, interfaces and so on).
|
||||
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
|
||||
buf := make([]byte, 0, 128+len(cm.Data)*8)
|
||||
buf = append(buf, `{"frequency":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Frequency, 10)
|
||||
buf = append(buf, `,"start":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Start, 10)
|
||||
buf = append(buf, `,"data":[`...)
|
||||
for i, x := range cm.Data {
|
||||
if i != 0 {
|
||||
buf = append(buf, ',')
|
||||
}
|
||||
if x.IsNaN() {
|
||||
buf = append(buf, `null`...)
|
||||
} else {
|
||||
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
|
||||
}
|
||||
}
|
||||
buf = append(buf, `]}`...)
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
|
||||
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
|
||||
// The good thing: Only a host at a time is locked, so this function can run
|
||||
// in parallel to writes/reads.
|
||||
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
|
||||
levels := make([]*Level, 0)
|
||||
selectors := make([][]string, 0)
|
||||
m.root.lock.RLock()
|
||||
for sel1, l1 := range m.root.children {
|
||||
l1.lock.RLock()
|
||||
for sel2, l2 := range l1.children {
|
||||
levels = append(levels, l2)
|
||||
selectors = append(selectors, []string{sel1, sel2})
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
m.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *Level
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(NumWorkers)
|
||||
work := make(chan workItem, NumWorkers*2)
|
||||
for worker := 0; worker < NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Printf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := 0; i < len(levels); i++ {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happend while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
retval := &CheckpointFile{
|
||||
From: from,
|
||||
To: to,
|
||||
Metrics: make(map[string]*CheckpointMetrics),
|
||||
Children: make(map[string]*CheckpointFile),
|
||||
}
|
||||
|
||||
for metric, minfo := range m.Metrics {
|
||||
b := l.metrics[minfo.offset]
|
||||
if b == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
allArchived := true
|
||||
b.iterFromTo(from, to, func(b *buffer) error {
|
||||
if !b.archived {
|
||||
allArchived = false
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if allArchived {
|
||||
continue
|
||||
}
|
||||
|
||||
data := make([]schema.Float, (to-from)/b.frequency+1)
|
||||
data, start, end, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := int((end - start) / b.frequency); i < len(data); i++ {
|
||||
data[i] = schema.NaN
|
||||
}
|
||||
|
||||
retval.Metrics[metric] = &CheckpointMetrics{
|
||||
Frequency: b.frequency,
|
||||
Start: start,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
for name, child := range l.children {
|
||||
val, err := child.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if val != nil {
|
||||
retval.Children[name] = val
|
||||
}
|
||||
}
|
||||
|
||||
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return retval, nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
|
||||
cf, err := l.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf == nil {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
|
||||
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(dir, 0o755)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
bw := bufio.NewWriter(f)
|
||||
if err = json.NewEncoder(bw).Encode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return bw.Flush()
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
|
||||
var wg sync.WaitGroup
|
||||
work := make(chan [2]string, NumWorkers)
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
wg.Add(NumWorkers)
|
||||
for worker := 0; worker < NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for host := range work {
|
||||
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
|
||||
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(nn))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
i := 0
|
||||
clustersDir, err := os.ReadDir(dir)
|
||||
for _, clusterDir := range clustersDir {
|
||||
if !clusterDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
goto done
|
||||
}
|
||||
|
||||
for _, hostDir := range hostsDir {
|
||||
if !hostDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
i++
|
||||
if i%NumWorkers == 0 && i > 100 {
|
||||
// Forcing garbage collection runs here regulary during the loading of checkpoints
|
||||
// will decrease the total heap size after loading everything back to memory is done.
|
||||
// While loading data, the heap will grow fast, so the GC target size will double
|
||||
// almost always. By forcing GCs here, we can keep it growing more slowly so that
|
||||
// at the end, less memory is wasted.
|
||||
runtime.GC()
|
||||
}
|
||||
|
||||
work <- [2]string{clusterDir.Name(), hostDir.Name()}
|
||||
}
|
||||
}
|
||||
done:
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happend while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
|
||||
// This function can only be called once and before the very first write or read.
|
||||
// Different host's data is loaded to memory in parallel.
|
||||
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
|
||||
if _, err := os.Stat(dir); os.IsNotExist(err) {
|
||||
// The directory does not exist, so create it using os.MkdirAll()
|
||||
err := os.MkdirAll(dir, 0o755) // 0755 sets the permissions for the directory
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
|
||||
}
|
||||
cclog.Printf("[METRICSTORE]> %#v Directory created successfully.\n", dir)
|
||||
}
|
||||
|
||||
// Config read (replace with your actual config read)
|
||||
fileFormat := Keys.Checkpoints.FileFormat
|
||||
if fileFormat == "" {
|
||||
fileFormat = "avro"
|
||||
}
|
||||
|
||||
// Map to easily get the fallback format
|
||||
oppositeFormat := map[string]string{
|
||||
"json": "avro",
|
||||
"avro": "json",
|
||||
}
|
||||
|
||||
// First, attempt to load the specified format
|
||||
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Printf("[METRICSTORE]> Loading %s files because fileformat is %s\n", fileFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, fileFormat)
|
||||
}
|
||||
|
||||
// If not found, attempt the opposite format
|
||||
altFormat := oppositeFormat[fileFormat]
|
||||
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Printf("[METRICSTORE]> Loading %s files but fileformat is %s\n", altFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, altFormat)
|
||||
}
|
||||
|
||||
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func checkFilesWithExtension(dir string, extension string) (bool, error) {
|
||||
found := false
|
||||
|
||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
|
||||
}
|
||||
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
|
||||
found = true
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
|
||||
}
|
||||
|
||||
return found, nil
|
||||
}
|
||||
|
||||
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
|
||||
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
|
||||
}
|
||||
|
||||
fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
|
||||
|
||||
// Same logic according to lineprotocol
|
||||
fromTimestamp -= (resolution / 2)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
|
||||
|
||||
var recordCounter int64 = 0
|
||||
|
||||
// Create a new OCF reader from the buffered reader
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
metricsData := make(map[string]schema.FloatArray)
|
||||
|
||||
for ocfReader.Scan() {
|
||||
datum, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
|
||||
}
|
||||
|
||||
record, ok := datum.(map[string]any)
|
||||
if !ok {
|
||||
panic("[METRICSTORE]> failed to assert datum as map[string]interface{}")
|
||||
}
|
||||
|
||||
for key, value := range record {
|
||||
metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64)))
|
||||
}
|
||||
|
||||
recordCounter += 1
|
||||
}
|
||||
|
||||
to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
|
||||
if to < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
for key, floatArray := range metricsData {
|
||||
metricName := ReplaceKey(key)
|
||||
|
||||
if strings.Contains(metricName, Delimiter) {
|
||||
subString := strings.Split(metricName, Delimiter)
|
||||
|
||||
lvl := l
|
||||
|
||||
for i := 0; i < len(subString)-1; i++ {
|
||||
|
||||
sel := subString[i]
|
||||
|
||||
if lvl.children == nil {
|
||||
lvl.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
child, ok := lvl.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
lvl.children[sel] = child
|
||||
}
|
||||
lvl = child
|
||||
}
|
||||
|
||||
leafMetricName := subString[len(subString)-1]
|
||||
err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
} else {
|
||||
err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
|
||||
n := len(floatArray)
|
||||
b := &buffer{
|
||||
frequency: resolution,
|
||||
start: from,
|
||||
data: floatArray[0:n:n],
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
b.close()
|
||||
|
||||
minfo, ok := m.Metrics[metricName]
|
||||
if !ok {
|
||||
return nil
|
||||
// return errors.New("Unkown metric: " + name)
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return errors.New("wooops")
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
|
||||
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
|
||||
if missingCount > 0 {
|
||||
missingCount /= int(b.frequency)
|
||||
|
||||
for range missingCount {
|
||||
prev.data = append(prev.data, schema.NaN)
|
||||
}
|
||||
|
||||
prev.data = prev.data[0:len(prev.data):len(prev.data)]
|
||||
}
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
cf := &CheckpointFile{}
|
||||
if err := json.NewDecoder(br).Decode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf.To != 0 && cf.To < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := l.loadFile(cf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
|
||||
for name, metric := range cf.Metrics {
|
||||
n := len(metric.Data)
|
||||
b := &buffer{
|
||||
frequency: metric.Frequency,
|
||||
start: metric.Start,
|
||||
data: metric.Data[0:n:n], // Space is wasted here :(
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
b.close()
|
||||
|
||||
minfo, ok := m.Metrics[name]
|
||||
if !ok {
|
||||
continue
|
||||
// return errors.New("Unkown metric: " + name)
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return errors.New("wooops")
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
}
|
||||
|
||||
if len(cf.Children) > 0 && l.children == nil {
|
||||
l.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
for sel, childCf := range cf.Children {
|
||||
child, ok := l.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
l.children[sel] = child
|
||||
}
|
||||
|
||||
if err := child.loadFile(childCf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
|
||||
direntries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return 0, err
|
||||
}
|
||||
|
||||
allFiles := make([]fs.DirEntry, 0)
|
||||
filesLoaded := 0
|
||||
for _, e := range direntries {
|
||||
if e.IsDir() {
|
||||
child := &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: make(map[string]*Level),
|
||||
}
|
||||
|
||||
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
|
||||
filesLoaded += files
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
l.children[e.Name()] = child
|
||||
} else if strings.HasSuffix(e.Name(), "."+extension) {
|
||||
allFiles = append(allFiles, e)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
files, err := findFiles(allFiles, from, extension, true)
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
|
||||
"json": l.loadJSONFile,
|
||||
"avro": l.loadAvroFile,
|
||||
}
|
||||
|
||||
loader := loaders[extension]
|
||||
|
||||
for _, filename := range files {
|
||||
f, err := os.Open(path.Join(dir, filename))
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err = loader(m, f, from); err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
filesLoaded += 1
|
||||
}
|
||||
|
||||
return filesLoaded, nil
|
||||
}
|
||||
|
||||
// This will probably get very slow over time!
|
||||
// A solution could be some sort of an index file in which all other files
|
||||
// and the timespan they contain is listed.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
|
||||
nums := map[string]int64{}
|
||||
for _, e := range direntries {
|
||||
if !strings.HasSuffix(e.Name(), "."+extension) {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nums[e.Name()] = ts
|
||||
}
|
||||
|
||||
sort.Slice(direntries, func(i, j int) bool {
|
||||
a, b := direntries[i], direntries[j]
|
||||
return nums[a.Name()] < nums[b.Name()]
|
||||
})
|
||||
|
||||
filenames := make([]string, 0)
|
||||
for i := range direntries {
|
||||
e := direntries[i]
|
||||
ts1 := nums[e.Name()]
|
||||
|
||||
if findMoreRecentFiles && t <= ts1 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
if i == len(direntries)-1 {
|
||||
continue
|
||||
}
|
||||
|
||||
enext := direntries[i+1]
|
||||
ts2 := nums[enext.Name()]
|
||||
|
||||
if findMoreRecentFiles {
|
||||
if ts1 < t && t < ts2 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
} else {
|
||||
if ts2 < t {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return filenames, nil
|
||||
}
|
||||
118
internal/memorystore/config.go
Normal file
118
internal/memorystore/config.go
Normal file
@@ -0,0 +1,118 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
var InternalCCMSFlag bool = false
|
||||
|
||||
type MetricStoreConfig struct {
|
||||
Checkpoints struct {
|
||||
FileFormat string `json:"file-format"`
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
Restore string `json:"restore"`
|
||||
} `json:"checkpoints"`
|
||||
Debug struct {
|
||||
DumpToFile string `json:"dump-to-file"`
|
||||
EnableGops bool `json:"gops"`
|
||||
} `json:"debug"`
|
||||
RetentionInMemory string `json:"retention-in-memory"`
|
||||
Archive struct {
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
DeleteInstead bool `json:"delete-instead"`
|
||||
} `json:"archive"`
|
||||
Nats []*NatsConfig `json:"nats"`
|
||||
}
|
||||
|
||||
type NatsConfig struct {
|
||||
// Address of the nats server
|
||||
Address string `json:"address"`
|
||||
|
||||
// Username/Password, optional
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
|
||||
// Creds file path
|
||||
Credsfilepath string `json:"creds-file-path"`
|
||||
|
||||
Subscriptions []struct {
|
||||
// Channel name
|
||||
SubscribeTo string `json:"subscribe-to"`
|
||||
|
||||
// Allow lines without a cluster tag, use this as default, optional
|
||||
ClusterTag string `json:"cluster-tag"`
|
||||
} `json:"subscriptions"`
|
||||
}
|
||||
|
||||
var Keys MetricStoreConfig
|
||||
|
||||
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
|
||||
type AggregationStrategy int
|
||||
|
||||
const (
|
||||
NoAggregation AggregationStrategy = iota
|
||||
SumAggregation
|
||||
AvgAggregation
|
||||
)
|
||||
|
||||
func AssignAggregationStratergy(str string) (AggregationStrategy, error) {
|
||||
switch str {
|
||||
case "":
|
||||
return NoAggregation, nil
|
||||
case "sum":
|
||||
return SumAggregation, nil
|
||||
case "avg":
|
||||
return AvgAggregation, nil
|
||||
default:
|
||||
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
|
||||
}
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
// Interval in seconds at which measurements are stored
|
||||
Frequency int64
|
||||
|
||||
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
|
||||
Aggregation AggregationStrategy
|
||||
|
||||
// Private, used internally...
|
||||
offset int
|
||||
}
|
||||
|
||||
var Metrics map[string]MetricConfig
|
||||
|
||||
func GetMetricFrequency(metricName string) (int64, error) {
|
||||
if metric, ok := Metrics[metricName]; ok {
|
||||
return metric.Frequency, nil
|
||||
}
|
||||
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
|
||||
}
|
||||
|
||||
// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency.
|
||||
// use metric.Name to check if the metric already exists.
|
||||
// if not, add it to the Metrics map.
|
||||
func AddMetric(name string, metric MetricConfig) error {
|
||||
if Metrics == nil {
|
||||
Metrics = make(map[string]MetricConfig, 0)
|
||||
}
|
||||
|
||||
if existingMetric, ok := Metrics[name]; ok {
|
||||
if existingMetric.Frequency != metric.Frequency {
|
||||
if existingMetric.Frequency < metric.Frequency {
|
||||
existingMetric.Frequency = metric.Frequency
|
||||
Metrics[name] = existingMetric
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Metrics[name] = metric
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
89
internal/memorystore/configSchema.go
Normal file
89
internal/memorystore/configSchema.go
Normal file
@@ -0,0 +1,89 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
const configSchema = `{
|
||||
"type": "object",
|
||||
"description": "Configuration specific to built-in metric-store.",
|
||||
"properties": {
|
||||
"checkpoints": {
|
||||
"description": "Configuration for checkpointing the metrics within metric-store",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file-format": {
|
||||
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
|
||||
"type": "string"
|
||||
},
|
||||
"interval": {
|
||||
"description": "Interval at which the metrics should be checkpointed.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the checkpointed files should be placed.",
|
||||
"type": "string"
|
||||
},
|
||||
"restore": {
|
||||
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"archive": {
|
||||
"description": "Configuration for archiving the already checkpointed files.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"interval": {
|
||||
"description": "Interval at which the checkpointed files should be archived.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the archived files should be placed.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"retention-in-memory": {
|
||||
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
|
||||
"type": "string"
|
||||
},
|
||||
"nats": {
|
||||
"description": "Configuration for accepting published data through NATS.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"address": {
|
||||
"description": "Address of the NATS server.",
|
||||
"type": "string"
|
||||
},
|
||||
"username": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"password": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"creds-file-path": {
|
||||
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
|
||||
"type": "string"
|
||||
},
|
||||
"subscriptions": {
|
||||
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"subscribe-to": {
|
||||
"description": "Channel name",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster-tag": {
|
||||
"description": "Optional: Allow lines without a cluster tag, use this as default",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
112
internal/memorystore/debug.go
Normal file
112
internal/memorystore/debug.go
Normal file
@@ -0,0 +1,112 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func (b *buffer) debugDump(buf []byte) []byte {
|
||||
if b.prev != nil {
|
||||
buf = b.prev.debugDump(buf)
|
||||
}
|
||||
|
||||
start, len, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data))
|
||||
buf = append(buf, `{"start":`...)
|
||||
buf = strconv.AppendInt(buf, start, 10)
|
||||
buf = append(buf, `,"len":`...)
|
||||
buf = strconv.AppendInt(buf, int64(len), 10)
|
||||
buf = append(buf, `,"end":`...)
|
||||
buf = strconv.AppendInt(buf, end, 10)
|
||||
if b.archived {
|
||||
buf = append(buf, `,"saved":true`...)
|
||||
}
|
||||
if b.next != nil {
|
||||
buf = append(buf, `},`...)
|
||||
} else {
|
||||
buf = append(buf, `}`...)
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, lvlname...)
|
||||
buf = append(buf, "\":{\n"...)
|
||||
depth += 1
|
||||
objitems := 0
|
||||
for name, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, name...)
|
||||
buf = append(buf, `":[`...)
|
||||
buf = b.debugDump(buf)
|
||||
buf = append(buf, "],\n"...)
|
||||
objitems++
|
||||
}
|
||||
}
|
||||
|
||||
for name, lvl := range l.children {
|
||||
_, err := w.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = buf[0:0]
|
||||
buf, err = lvl.debugDump(m, w, name, buf, depth)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = append(buf, ',', '\n')
|
||||
objitems++
|
||||
}
|
||||
|
||||
// remove final `,`:
|
||||
if objitems > 0 {
|
||||
buf = append(buf[0:len(buf)-1], '\n')
|
||||
}
|
||||
|
||||
depth -= 1
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '}')
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 2048)
|
||||
buf = append(buf, "{"...)
|
||||
|
||||
buf, err := lvl.debugDump(m, w, "data", buf, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
buf = append(buf, "}\n"...)
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
92
internal/memorystore/healthcheck.go
Normal file
92
internal/memorystore/healthcheck.go
Normal file
@@ -0,0 +1,92 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
|
||||
// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
|
||||
// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
|
||||
const MaxMissingDataPoints int64 = 5
|
||||
|
||||
// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
|
||||
// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
|
||||
// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
|
||||
// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
|
||||
const MaxUnhealthyMetrics int64 = 5
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
}
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
count += c
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
192
internal/memorystore/level.go
Normal file
192
internal/memorystore/level.go
Normal file
@@ -0,0 +1,192 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
// Could also be called "node" as this forms a node in a tree structure.
|
||||
// Called Level because "node" might be confusing here.
|
||||
// Can be both a leaf or a inner node. In this tree structue, inner nodes can
|
||||
// also hold data (in `metrics`).
|
||||
type Level struct {
|
||||
children map[string]*Level
|
||||
metrics []*buffer
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
// Find the correct level for the given selector, creating it if
|
||||
// it does not exist. Example selector in the context of the
|
||||
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
|
||||
// This function would probably benefit a lot from `level.children` beeing a `sync.Map`?
|
||||
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *Level
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok := l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unqiue access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, nMetrics),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*Level{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
|
||||
func (l *Level) free(t int64) (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
n := 0
|
||||
for i, b := range l.metrics {
|
||||
if b != nil {
|
||||
delme, m := b.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
if cap(b.data) == BufferCap {
|
||||
bufferPool.Put(b)
|
||||
}
|
||||
l.metrics[i] = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range l.children {
|
||||
m, err := l.free(t)
|
||||
n += m
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (l *Level) sizeInBytes() int64 {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
size := int64(0)
|
||||
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
|
||||
}
|
||||
}
|
||||
|
||||
for _, child := range l.children {
|
||||
size += child.sizeInBytes()
|
||||
}
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
func (l *Level) findLevel(selector []string) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
lvl := l.children[selector[0]]
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return lvl.findLevel(selector[1:])
|
||||
}
|
||||
|
||||
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
if len(selector) == 0 {
|
||||
b := l.metrics[offset]
|
||||
if b != nil {
|
||||
return f(b)
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
err := lvl.findBuffers(nil, offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
sel := selector[0]
|
||||
if len(sel.String) != 0 && l.children != nil {
|
||||
lvl, ok := l.children[sel.String]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Group != nil && l.children != nil {
|
||||
for _, key := range sel.Group {
|
||||
lvl, ok := l.children[key]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Any && l.children != nil {
|
||||
for _, lvl := range l.children {
|
||||
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
351
internal/memorystore/lineprotocol.go
Normal file
351
internal/memorystore/lineprotocol.go
Normal file
@@ -0,0 +1,351 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// Each connection is handled in it's own goroutine. This is a blocking function.
|
||||
// func ReceiveRaw(ctx context.Context,
|
||||
// listener net.Listener,
|
||||
// handleLine func(*lineprotocol.Decoder, string) error,
|
||||
// ) error {
|
||||
// var wg sync.WaitGroup
|
||||
|
||||
// wg.Add(1)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// <-ctx.Done()
|
||||
// if err := listener.Close(); err != nil {
|
||||
// log.Printf("listener.Close(): %s", err.Error())
|
||||
// }
|
||||
// }()
|
||||
|
||||
// for {
|
||||
// conn, err := listener.Accept()
|
||||
// if err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// break
|
||||
// }
|
||||
|
||||
// log.Printf("listener.Accept(): %s", err.Error())
|
||||
// }
|
||||
|
||||
// wg.Add(2)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// defer conn.Close()
|
||||
|
||||
// dec := lineprotocol.NewDecoder(conn)
|
||||
// connctx, cancel := context.WithCancel(context.Background())
|
||||
// defer cancel()
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// select {
|
||||
// case <-connctx.Done():
|
||||
// conn.Close()
|
||||
// case <-ctx.Done():
|
||||
// conn.Close()
|
||||
// }
|
||||
// }()
|
||||
|
||||
// if err := handleLine(dec, "default"); err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// return
|
||||
// }
|
||||
|
||||
// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
|
||||
// errmsg := make([]byte, 128)
|
||||
// errmsg = append(errmsg, `error: `...)
|
||||
// errmsg = append(errmsg, err.Error()...)
|
||||
// errmsg = append(errmsg, '\n')
|
||||
// conn.Write(errmsg)
|
||||
// }
|
||||
// }()
|
||||
// }
|
||||
|
||||
// wg.Wait()
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// ReceiveNats connects to a nats server and subscribes to "updates". This is a
|
||||
// blocking function. handleLine will be called for each line recieved via
|
||||
// nats. Send `true` through the done channel for gracefull termination.
|
||||
func ReceiveNats(conf *(NatsConfig),
|
||||
ms *MemoryStore,
|
||||
workers int,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
var opts []nats.Option
|
||||
if conf.Username != "" && conf.Password != "" {
|
||||
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
|
||||
}
|
||||
|
||||
if conf.Credsfilepath != "" {
|
||||
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
|
||||
}
|
||||
|
||||
nc, err := nats.Connect(conf.Address, opts...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nc.Close()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
var subs []*nats.Subscription
|
||||
|
||||
msgs := make(chan *nats.Msg, workers*2)
|
||||
|
||||
for _, sc := range conf.Subscriptions {
|
||||
clusterTag := sc.ClusterTag
|
||||
var sub *nats.Subscription
|
||||
if workers > 1 {
|
||||
wg.Add(workers)
|
||||
|
||||
for range workers {
|
||||
go func() {
|
||||
for m := range msgs {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Printf("error: %s\n", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
msgs <- m
|
||||
})
|
||||
} else {
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Printf("error: %s\n", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cclog.Printf("NATS subscription to '%s' on '%s' established\n", sc.SubscribeTo, conf.Address)
|
||||
subs = append(subs, sub)
|
||||
}
|
||||
|
||||
<-ctx.Done()
|
||||
for _, sub := range subs {
|
||||
err = sub.Unsubscribe()
|
||||
if err != nil {
|
||||
cclog.Printf("NATS unsubscribe failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
close(msgs)
|
||||
wg.Wait()
|
||||
|
||||
nc.Close()
|
||||
cclog.Print("NATS connection closed")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Place `prefix` in front of `buf` but if possible,
|
||||
// do that inplace in `buf`.
|
||||
func reorder(buf, prefix []byte) []byte {
|
||||
n := len(prefix)
|
||||
m := len(buf)
|
||||
if cap(buf) < m+n {
|
||||
return append(prefix[:n:n], buf...)
|
||||
} else {
|
||||
buf = buf[:n+m]
|
||||
for i := m - 1; i >= 0; i-- {
|
||||
buf[i+n] = buf[i]
|
||||
}
|
||||
for i := range n {
|
||||
buf[i] = prefix[i]
|
||||
}
|
||||
return buf
|
||||
}
|
||||
}
|
||||
|
||||
// Decode lines using dec and make write calls to the MemoryStore.
|
||||
// If a line is missing its cluster tag, use clusterDefault as default.
|
||||
func DecodeLine(dec *lineprotocol.Decoder,
|
||||
ms *MemoryStore,
|
||||
clusterDefault string,
|
||||
) error {
|
||||
// Reduce allocations in loop:
|
||||
t := time.Now()
|
||||
metric, metricBuf := Metric{}, make([]byte, 0, 16)
|
||||
selector := make([]string, 0, 4)
|
||||
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
|
||||
|
||||
// Optimize for the case where all lines in a "batch" are about the same
|
||||
// cluster and host. By using `WriteToLevel` (level = host), we do not need
|
||||
// to take the root- and cluster-level lock as often.
|
||||
var lvl *Level = nil
|
||||
prevCluster, prevHost := "", ""
|
||||
|
||||
var ok bool
|
||||
for dec.Next() {
|
||||
rawmeasurement, err := dec.Measurement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Needs to be copied because another call to dec.* would
|
||||
// invalidate the returned slice.
|
||||
metricBuf = append(metricBuf[:0], rawmeasurement...)
|
||||
|
||||
// The go compiler optimizes map[string(byteslice)] lookups:
|
||||
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
|
||||
cluster, host := clusterDefault, ""
|
||||
for {
|
||||
key, val, err := dec.NextTag()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
// The go compiler optimizes string([]byte{...}) == "...":
|
||||
switch string(key) {
|
||||
case "cluster":
|
||||
if string(val) == prevCluster {
|
||||
cluster = prevCluster
|
||||
} else {
|
||||
cluster = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "hostname", "host":
|
||||
if string(val) == prevHost {
|
||||
host = prevHost
|
||||
} else {
|
||||
host = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "type":
|
||||
if string(val) == "node" {
|
||||
break
|
||||
}
|
||||
|
||||
// We cannot be sure that the "type" tag comes before the "type-id" tag:
|
||||
if len(typeBuf) == 0 {
|
||||
typeBuf = append(typeBuf, val...)
|
||||
} else {
|
||||
typeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "type-id":
|
||||
typeBuf = append(typeBuf, val...)
|
||||
case "subtype":
|
||||
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
|
||||
if len(subTypeBuf) == 0 {
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
} else {
|
||||
subTypeBuf = reorder(subTypeBuf, val)
|
||||
// subTypeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "stype-id":
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
default:
|
||||
// Ignore unkown tags (cc-metric-collector might send us a unit for example that we do not need)
|
||||
// return fmt.Errorf("unkown tag: '%s' (value: '%s')", string(key), string(val))
|
||||
}
|
||||
}
|
||||
|
||||
// If the cluster or host changed, the lvl was set to nil
|
||||
if lvl == nil {
|
||||
selector = selector[:2]
|
||||
selector[0], selector[1] = cluster, host
|
||||
lvl = ms.GetLevel(selector)
|
||||
prevCluster, prevHost = cluster, host
|
||||
}
|
||||
|
||||
// subtypes:
|
||||
selector = selector[:0]
|
||||
if len(typeBuf) > 0 {
|
||||
selector = append(selector, string(typeBuf)) // <- Allocation :(
|
||||
if len(subTypeBuf) > 0 {
|
||||
selector = append(selector, string(subTypeBuf))
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
key, val, err := dec.NextField()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if string(key) != "value" {
|
||||
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
|
||||
}
|
||||
|
||||
if val.Kind() == lineprotocol.Float {
|
||||
metric.Value = schema.Float(val.FloatV())
|
||||
} else if val.Kind() == lineprotocol.Int {
|
||||
metric.Value = schema.Float(val.IntV())
|
||||
} else if val.Kind() == lineprotocol.Uint {
|
||||
metric.Value = schema.Float(val.UintV())
|
||||
} else {
|
||||
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
|
||||
}
|
||||
}
|
||||
|
||||
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
|
||||
time := t.Unix()
|
||||
|
||||
if Keys.Checkpoints.FileFormat != "json" {
|
||||
LineProtocolMessages <- &AvroStruct{
|
||||
MetricName: string(metricBuf),
|
||||
Cluster: cluster,
|
||||
Node: host,
|
||||
Selector: append([]string{}, selector...),
|
||||
Value: metric.Value,
|
||||
Timestamp: time,
|
||||
}
|
||||
}
|
||||
|
||||
if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
486
internal/memorystore/memorystore.go
Normal file
486
internal/memorystore/memorystore.go
Normal file
@@ -0,0 +1,486 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
var (
|
||||
singleton sync.Once
|
||||
msInstance *MemoryStore
|
||||
)
|
||||
|
||||
var NumWorkers int = 4
|
||||
|
||||
func init() {
|
||||
maxWorkers := 10
|
||||
NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers)
|
||||
}
|
||||
|
||||
type Metric struct {
|
||||
Name string
|
||||
Value schema.Float
|
||||
MetricConfig MetricConfig
|
||||
}
|
||||
|
||||
type MemoryStore struct {
|
||||
Metrics map[string]MetricConfig
|
||||
root Level
|
||||
}
|
||||
|
||||
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
|
||||
startupTime := time.Now()
|
||||
|
||||
if rawConfig != nil {
|
||||
config.Validate(configSchema, rawConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(rawConfig))
|
||||
// dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range archive.Clusters {
|
||||
for _, mc := range c.MetricConfig {
|
||||
agg, err := AssignAggregationStratergy(mc.Aggregation)
|
||||
if err != nil {
|
||||
cclog.Warnf("Could not find aggregation stratergy for metric config '%s': %s", mc.Name, err.Error())
|
||||
}
|
||||
|
||||
AddMetric(mc.Name, MetricConfig{
|
||||
Frequency: int64(mc.Timestep),
|
||||
Aggregation: agg,
|
||||
})
|
||||
}
|
||||
|
||||
for _, sc := range c.SubClusters {
|
||||
for _, mc := range sc.MetricConfig {
|
||||
agg, err := AssignAggregationStratergy(mc.Aggregation)
|
||||
if err != nil {
|
||||
cclog.Warnf("Could not find aggregation stratergy for metric config '%s': %s", mc.Name, err.Error())
|
||||
}
|
||||
|
||||
AddMetric(mc.Name, MetricConfig{
|
||||
Frequency: int64(mc.Timestep),
|
||||
Aggregation: agg,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the config.MetricStoreKeys
|
||||
InitMetrics(Metrics)
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
restoreFrom := startupTime.Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
|
||||
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
|
||||
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
|
||||
}
|
||||
|
||||
// Try to use less memory by forcing a GC run here and then
|
||||
// lowering the target percentage. The default of 100 means
|
||||
// that only once the ratio of new allocations execeds the
|
||||
// previously active heap, a GC is triggered.
|
||||
// Forcing a GC here will set the "previously active heap"
|
||||
// to a minumum.
|
||||
runtime.GC()
|
||||
|
||||
ctx, shutdown := context.WithCancel(context.Background())
|
||||
|
||||
wg.Add(4)
|
||||
|
||||
Retention(wg, ctx)
|
||||
Checkpointing(wg, ctx)
|
||||
Archiving(wg, ctx)
|
||||
DataStaging(wg, ctx)
|
||||
|
||||
wg.Add(1)
|
||||
sigs := make(chan os.Signal, 1)
|
||||
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-sigs
|
||||
runtimeEnv.SystemdNotifiy(false, "[METRICSTORE]> Shutting down ...")
|
||||
shutdown()
|
||||
}()
|
||||
|
||||
if Keys.Nats != nil {
|
||||
for _, natsConf := range Keys.Nats {
|
||||
// TODO: When multiple nats configs share a URL, do a single connect.
|
||||
wg.Add(1)
|
||||
nc := natsConf
|
||||
go func() {
|
||||
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
|
||||
err := ReceiveNats(nc, ms, 1, ctx)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// InitMetrics creates a new, initialized instance of a MemoryStore.
|
||||
// Will panic if values in the metric configurations are invalid.
|
||||
func InitMetrics(metrics map[string]MetricConfig) {
|
||||
singleton.Do(func() {
|
||||
offset := 0
|
||||
for key, cfg := range metrics {
|
||||
if cfg.Frequency == 0 {
|
||||
panic("[METRICSTORE]> invalid frequency")
|
||||
}
|
||||
|
||||
metrics[key] = MetricConfig{
|
||||
Frequency: cfg.Frequency,
|
||||
Aggregation: cfg.Aggregation,
|
||||
offset: offset,
|
||||
}
|
||||
offset += 1
|
||||
}
|
||||
|
||||
msInstance = &MemoryStore{
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
Metrics: metrics,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetMemoryStore() *MemoryStore {
|
||||
if msInstance == nil {
|
||||
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
|
||||
}
|
||||
|
||||
return msInstance
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
|
||||
var files int
|
||||
var err error
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
|
||||
} else {
|
||||
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
|
||||
close(LineProtocolMessages)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
|
||||
|
||||
// ms.PrintHeirarchy()
|
||||
}
|
||||
|
||||
// func (m *MemoryStore) PrintHeirarchy() {
|
||||
// m.root.lock.Lock()
|
||||
// defer m.root.lock.Unlock()
|
||||
|
||||
// fmt.Printf("Root : \n")
|
||||
|
||||
// for lvl1, sel1 := range m.root.children {
|
||||
// fmt.Printf("\t%s\n", lvl1)
|
||||
// for lvl2, sel2 := range sel1.children {
|
||||
// fmt.Printf("\t\t%s\n", lvl2)
|
||||
// if lvl1 == "fritz" && lvl2 == "f0201" {
|
||||
|
||||
// for name, met := range m.Metrics {
|
||||
// mt := sel2.metrics[met.Offset]
|
||||
|
||||
// fmt.Printf("\t\t\t\t%s\n", name)
|
||||
// fmt.Printf("\t\t\t\t")
|
||||
|
||||
// for mt != nil {
|
||||
// // if name == "cpu_load" {
|
||||
// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
|
||||
// // }
|
||||
// mt = mt.prev
|
||||
// }
|
||||
// fmt.Printf("\n")
|
||||
|
||||
// }
|
||||
// }
|
||||
// for lvl3, sel3 := range sel2.children {
|
||||
// if lvl1 == "fritz" && lvl2 == "f0201" && lvl3 == "hwthread70" {
|
||||
|
||||
// fmt.Printf("\t\t\t\t\t%s\n", lvl3)
|
||||
|
||||
// for name, met := range m.Metrics {
|
||||
// mt := sel3.metrics[met.Offset]
|
||||
|
||||
// fmt.Printf("\t\t\t\t\t\t%s\n", name)
|
||||
|
||||
// fmt.Printf("\t\t\t\t\t\t")
|
||||
|
||||
// for mt != nil {
|
||||
// // if name == "clock" {
|
||||
// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
|
||||
|
||||
// mt = mt.prev
|
||||
// }
|
||||
// fmt.Printf("\n")
|
||||
|
||||
// }
|
||||
|
||||
// // for i, _ := range sel3.metrics {
|
||||
// // fmt.Printf("\t\t\t\t\t%s\n", getName(configmetrics, i))
|
||||
// // }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// }
|
||||
|
||||
func getName(m *MemoryStore, i int) string {
|
||||
for key, val := range m.Metrics {
|
||||
if val.offset == i {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func Retention(wg *sync.WaitGroup, ctx context.Context) {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.RetentionInMemory)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
d := d / 2
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
|
||||
freed, err := ms.Free(nil, t.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
|
||||
// Look at `findLevelOrCreate` for how selectors work.
|
||||
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
|
||||
var ok bool
|
||||
for i, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
metric.MetricConfig, ok = m.Metrics[metric.Name]
|
||||
if !ok {
|
||||
metric.MetricConfig.Frequency = 0
|
||||
}
|
||||
metrics[i] = metric
|
||||
}
|
||||
}
|
||||
|
||||
return m.WriteToLevel(&m.root, selector, ts, metrics)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) GetLevel(selector []string) *Level {
|
||||
return m.root.findLevelOrCreate(selector, len(m.Metrics))
|
||||
}
|
||||
|
||||
// WriteToLevel assumes that `minfo` in `metrics` is filled in
|
||||
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
|
||||
l = l.findLevelOrCreate(selector, len(m.Metrics))
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
for _, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
b := l.metrics[metric.MetricConfig.offset]
|
||||
if b == nil {
|
||||
// First write to this metric and level
|
||||
b = newBuffer(ts, metric.MetricConfig.Frequency)
|
||||
l.metrics[metric.MetricConfig.offset] = b
|
||||
}
|
||||
|
||||
nb, err := b.write(ts, metric.Value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Last write created a new buffer...
|
||||
if b != nb {
|
||||
l.metrics[metric.MetricConfig.offset] = nb
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
|
||||
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
|
||||
// The second and third return value are the actual from/to for the data. Those can be different from
|
||||
// the range asked for if no data was available.
|
||||
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
|
||||
}
|
||||
|
||||
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
|
||||
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
cdata, cfrom, cto, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto || len(data) != len(cdata) {
|
||||
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
|
||||
if missingfront != 0 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
newlen := len(cdata) - missingback
|
||||
if newlen < 1 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
cdata = cdata[0:newlen]
|
||||
if len(cdata) != len(data) {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
from, to = cfrom, cto
|
||||
}
|
||||
|
||||
data = cdata
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
} else if n == 0 {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
|
||||
} else if n > 1 {
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
normalize := 1. / schema.Float(n)
|
||||
for i := 0; i < len(data); i++ {
|
||||
data[i] *= normalize
|
||||
}
|
||||
} else if minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
|
||||
}
|
||||
}
|
||||
|
||||
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
}
|
||||
|
||||
return data, from, to, resolution, nil
|
||||
}
|
||||
|
||||
// Free releases all buffers for the selected level and all its children that
|
||||
// contain only values older than `t`.
|
||||
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
|
||||
return m.GetLevel(selector).free(t)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FreeAll() error {
|
||||
for k := range m.root.children {
|
||||
delete(m.root.children, k)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) SizeInBytes() int64 {
|
||||
return m.root.sizeInBytes()
|
||||
}
|
||||
|
||||
// ListChildren , given a selector, returns a list of all children of the level
|
||||
// selected.
|
||||
func (m *MemoryStore) ListChildren(selector []string) []string {
|
||||
lvl := &m.root
|
||||
for lvl != nil && len(selector) != 0 {
|
||||
lvl.lock.RLock()
|
||||
next := lvl.children[selector[0]]
|
||||
lvl.lock.RUnlock()
|
||||
lvl = next
|
||||
selector = selector[1:]
|
||||
}
|
||||
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
lvl.lock.RLock()
|
||||
defer lvl.lock.RUnlock()
|
||||
|
||||
children := make([]string, 0, len(lvl.children))
|
||||
for child := range lvl.children {
|
||||
children = append(children, child)
|
||||
}
|
||||
|
||||
return children
|
||||
}
|
||||
124
internal/memorystore/stats.go
Normal file
124
internal/memorystore/stats.go
Normal file
@@ -0,0 +1,124 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
type Stats struct {
|
||||
Samples int
|
||||
Avg util.Float
|
||||
Min util.Float
|
||||
Max util.Float
|
||||
}
|
||||
|
||||
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
|
||||
if from < b.start {
|
||||
if b.prev != nil {
|
||||
return b.prev.stats(from, to)
|
||||
}
|
||||
from = b.start
|
||||
}
|
||||
|
||||
// TODO: Check if b.closed and if so and the full buffer is queried,
|
||||
// use b.statistics instead of iterating over the buffer.
|
||||
|
||||
samples := 0
|
||||
sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
|
||||
var t int64
|
||||
for t = from; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
b = b.next
|
||||
if b == nil {
|
||||
break
|
||||
}
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if t < b.start || idx >= len(b.data) {
|
||||
continue
|
||||
}
|
||||
|
||||
xf := float64(b.data[idx])
|
||||
if math.IsNaN(xf) {
|
||||
continue
|
||||
}
|
||||
|
||||
samples += 1
|
||||
sum += xf
|
||||
min = math.Min(min, xf)
|
||||
max = math.Max(max, xf)
|
||||
}
|
||||
|
||||
return Stats{
|
||||
Samples: samples,
|
||||
Avg: util.Float(sum) / util.Float(samples),
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, t, nil
|
||||
}
|
||||
|
||||
// Returns statistics for the requested metric on the selected node/level.
|
||||
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
|
||||
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
|
||||
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, errors.New("invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, errors.New("unkown metric: " + metric)
|
||||
}
|
||||
|
||||
n, samples := 0, 0
|
||||
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
stats, cfrom, cto, err := b.stats(from, to)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
samples += stats.Samples
|
||||
avg += stats.Avg
|
||||
min = math.Min(min, float64(stats.Min))
|
||||
max = math.Max(max, float64(stats.Max))
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
return nil, 0, 0, ErrNoData
|
||||
}
|
||||
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
avg /= util.Float(n)
|
||||
} else if n > 1 && minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, errors.New("invalid aggregation")
|
||||
}
|
||||
|
||||
return &Stats{
|
||||
Samples: samples,
|
||||
Avg: avg,
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, to, nil
|
||||
}
|
||||
382
internal/metricDataDispatcher/dataLoader.go
Normal file
382
internal/metricDataDispatcher/dataLoader.go
Normal file
@@ -0,0 +1,382 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricDataDispatcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
config.Keys.DisableArchive {
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
cclog.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jd_temp schema.JobData
|
||||
jd_temp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
cclog.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Deep copy the cached archive hashmap
|
||||
jd = metricdata.DeepCopy(jd_temp)
|
||||
|
||||
// Resampling for archived data.
|
||||
// Pass the resolution from frontend here.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := int64(0)
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = int(timestep)
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// FIXME: Review: Is this really necessary or correct.
|
||||
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/median/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
// Existing (archived) StatsSeries can be 'min/mean/max'!
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
cclog.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for statsTable in frontend: Return scoped statistics by metric.
|
||||
func LoadScopedJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scopedStats, nil
|
||||
}
|
||||
|
||||
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
|
||||
func LoadJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.MetricStatistics, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadStatsFromArchive(job, metrics)
|
||||
}
|
||||
|
||||
data := make(map[string]schema.MetricStatistics, len(metrics))
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return data, err
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
min = math.Min(min, node.Min)
|
||||
max = math.Max(max, node.Max)
|
||||
}
|
||||
|
||||
data[m] = schema.MetricStatistics{
|
||||
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, totalNodes, hasNextPage, err := repo.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, resolution, from, to, page, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, totalNodes, hasNextPage, err
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, totalNodes, hasNextPage, nil
|
||||
}
|
||||
1153
internal/metricdata/cc-metric-store-internal.go
Normal file
1153
internal/metricdata/cc-metric-store-internal.go
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,313 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
||||
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
||||
)
|
||||
|
||||
type InfluxDBv2DataRepositoryConfig struct {
|
||||
Url string `json:"url"`
|
||||
Token string `json:"token"`
|
||||
Bucket string `json:"bucket"`
|
||||
Org string `json:"org"`
|
||||
SkipTls bool `json:"skiptls"`
|
||||
}
|
||||
|
||||
type InfluxDBv2DataRepository struct {
|
||||
client influxdb2.Client
|
||||
queryClient influxdb2Api.QueryAPI
|
||||
bucket, measurement string
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config InfluxDBv2DataRepositoryConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json config")
|
||||
return err
|
||||
}
|
||||
|
||||
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
|
||||
idb.queryClient = idb.client.QueryAPI(config.Org)
|
||||
idb.bucket = config.Bucket
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
|
||||
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
|
||||
return time.Unix(epoch, 0)
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
measurementsConds := make([]string, 0, len(metrics))
|
||||
for _, m := range metrics {
|
||||
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
|
||||
}
|
||||
measurementsCond := strings.Join(measurementsConds, " or ")
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
|
||||
// Requested Scopes
|
||||
for _, scope := range scopes {
|
||||
query := ""
|
||||
switch scope {
|
||||
case "node":
|
||||
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
|
||||
// log.Info("Scope 'node' requested. ")
|
||||
query = fmt.Sprintf(`
|
||||
from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => (%s) and (%s) )
|
||||
|> drop(columns: ["_start", "_stop"])
|
||||
|> group(columns: ["hostname", "_measurement"])
|
||||
|> aggregateWindow(every: 60s, fn: mean)
|
||||
|> drop(columns: ["_time"])`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
measurementsCond, hostsCond)
|
||||
case "socket":
|
||||
log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
case "core":
|
||||
log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
// Get Finest Granularity only, Set NULL to 0.0
|
||||
// query = fmt.Sprintf(`
|
||||
// from(bucket: "%s")
|
||||
// |> range(start: %s, stop: %s)
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> drop(columns: ["_start", "_stop", "cluster"])
|
||||
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
|
||||
// idb.bucket,
|
||||
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
|
||||
// measurementsCond, hostsCond)
|
||||
default:
|
||||
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
|
||||
continue
|
||||
// return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'")
|
||||
}
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
log.Error("Error while performing query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
|
||||
for _, metric := range metrics {
|
||||
jobMetric, ok := jobData[metric]
|
||||
if !ok {
|
||||
mc := archive.GetMetricConfig(job.Cluster, metric)
|
||||
jobMetric = map[schema.MetricScope]*schema.JobMetric{
|
||||
scope: { // uses scope var from above!
|
||||
Unit: mc.Unit,
|
||||
Timestep: mc.Timestep,
|
||||
Series: make([]schema.Series, 0, len(job.Resources)),
|
||||
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
|
||||
},
|
||||
}
|
||||
}
|
||||
jobData[metric] = jobMetric
|
||||
}
|
||||
|
||||
// Process Result: Time-Data
|
||||
field, host, hostSeries := "", "", schema.Series{}
|
||||
// typeId := 0
|
||||
switch scope {
|
||||
case "node":
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
|
||||
if host != "" {
|
||||
// Append Series before reset
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
|
||||
hostSeries = schema.Series{
|
||||
Hostname: host,
|
||||
Statistics: schema.MetricStatistics{}, //TODO Add Statistics
|
||||
Data: make([]schema.Float, 0),
|
||||
}
|
||||
}
|
||||
val, ok := row.Value().(float64)
|
||||
if ok {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
} else {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
|
||||
}
|
||||
}
|
||||
case "socket":
|
||||
continue
|
||||
case "core":
|
||||
continue
|
||||
// Include Series.Id in hostSeries
|
||||
// for rows.Next() {
|
||||
// row := rows.Record()
|
||||
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
|
||||
// if ( host != "" ) {
|
||||
// // Append Series before reset
|
||||
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
// }
|
||||
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
|
||||
// hostSeries = schema.Series{
|
||||
// Hostname: host,
|
||||
// Id: &typeId,
|
||||
// Statistics: nil,
|
||||
// Data: make([]schema.Float, 0),
|
||||
// }
|
||||
// }
|
||||
// val := row.Value().(float64)
|
||||
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
// }
|
||||
default:
|
||||
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
|
||||
continue
|
||||
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
|
||||
}
|
||||
// Append last Series
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
|
||||
// Get Stats
|
||||
stats, err := idb.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading statistics")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope == "node" { // No 'socket/core' support yet
|
||||
for metric, nodes := range stats {
|
||||
for node, stats := range nodes {
|
||||
for index, _ := range jobData[metric][scope].Series {
|
||||
if jobData[metric][scope].Series[index].Hostname == node {
|
||||
jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
// lenMet := len(metrics)
|
||||
|
||||
for _, metric := range metrics {
|
||||
// log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet)
|
||||
|
||||
query := fmt.Sprintf(`
|
||||
data = from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
|
||||
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
|
||||
data |> min(column: "_value") |> set(key: "_field", value: "min"),
|
||||
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|
||||
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|
||||
|> group()`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
metric, hostsCond)
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
log.Error("Error while performing query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodes := map[string]schema.MetricStatistics{}
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
host := row.ValueByKey("hostname").(string)
|
||||
|
||||
avg, avgok := row.ValueByKey("avg").(float64)
|
||||
if !avgok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg)
|
||||
avg = 0.0
|
||||
}
|
||||
min, minok := row.ValueByKey("min").(float64)
|
||||
if !minok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min)
|
||||
min = 0.0
|
||||
}
|
||||
max, maxok := row.ValueByKey("max").(float64)
|
||||
if !maxok {
|
||||
// log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max)
|
||||
max = 0.0
|
||||
}
|
||||
|
||||
nodes[host] = schema.MetricStatistics{
|
||||
Avg: avg,
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
stats[metric] = nodes
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
|
||||
// TODO : Implement to be used in Analysis- und System/Node-View
|
||||
log.Infof("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes)
|
||||
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
|
||||
}
|
||||
@@ -1,21 +1,21 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type MetricDataRepository interface {
|
||||
@@ -24,28 +24,31 @@ type MetricDataRepository interface {
|
||||
Init(rawConfig json.RawMessage) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes for that node.
|
||||
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
|
||||
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
|
||||
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
|
||||
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error)
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
|
||||
var useArchive bool
|
||||
|
||||
func Init(disableArchive bool) error {
|
||||
useArchive = !disableArchive
|
||||
for _, cluster := range config.Keys.Clusters {
|
||||
func Init() error {
|
||||
for _, cluster := range config.Clusters {
|
||||
if cluster.MetricDataRepository != nil {
|
||||
var kind struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json MetricDataRepository")
|
||||
cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -53,8 +56,9 @@ func Init(disableArchive bool) error {
|
||||
switch kind.Kind {
|
||||
case "cc-metric-store":
|
||||
mdr = &CCMetricStore{}
|
||||
case "influxdb":
|
||||
mdr = &InfluxDBv2DataRepository{}
|
||||
case "cc-metric-store-internal":
|
||||
mdr = &CCMetricStoreInternal{}
|
||||
memorystore.InternalCCMSFlag = true
|
||||
case "prometheus":
|
||||
mdr = &PrometheusDataRepository{}
|
||||
case "test":
|
||||
@@ -64,7 +68,7 @@ func Init(disableArchive bool) error {
|
||||
}
|
||||
|
||||
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
||||
log.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.Name] = mdr
|
||||
@@ -73,284 +77,13 @@ func Init(disableArchive bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
!useArchive {
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
log.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
jd, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
log.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
prepareJobData(job, jd, scopes)
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
log.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && useArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
|
||||
var err error
|
||||
repo, ok := metricDataRepos[cluster]
|
||||
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
log.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]",
|
||||
job.ID, job.State, metrics, scopes)
|
||||
}
|
||||
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/mean/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
func prepareJobData(
|
||||
job *schema.Job,
|
||||
jobData schema.JobData,
|
||||
scopes []schema.MetricScope,
|
||||
) {
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jobData {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jobData.AddNodeScope("flops_any")
|
||||
jobData.AddNodeScope("mem_bw")
|
||||
}
|
||||
}
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
// TODO: Talk about this! What resolutions to store data at...
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
if job.NumNodes <= 8 {
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||
if err != nil {
|
||||
log.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if !useArchive {
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
|
||||
return repo, err
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) 2022 DKRZ
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricdata
|
||||
@@ -20,9 +20,10 @@ import (
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
promapi "github.com/prometheus/client_golang/api"
|
||||
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
promcfg "github.com/prometheus/common/config"
|
||||
@@ -159,17 +160,17 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config PrometheusDataRepositoryConfig
|
||||
// parse config
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Warn("Error while unmarshaling raw json config")
|
||||
cclog.Warn("Error while unmarshaling raw json config")
|
||||
return err
|
||||
}
|
||||
// support basic authentication
|
||||
var rt http.RoundTripper = nil
|
||||
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
|
||||
prom_pw := promcfg.Secret(prom_pw)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(config.Username, prom_pw, "", promapi.DefaultRoundTripper)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
|
||||
} else {
|
||||
if config.Username != "" {
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set.")
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
|
||||
}
|
||||
}
|
||||
// init client
|
||||
@@ -178,7 +179,7 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
RoundTripper: rt,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error("Error while initializing new prometheus client")
|
||||
cclog.Error("Error while initializing new prometheus client")
|
||||
return err
|
||||
}
|
||||
// init query client
|
||||
@@ -191,9 +192,9 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
for metric, templ := range config.Templates {
|
||||
pdb.templates[metric], err = template.New(metric).Parse(templ)
|
||||
if err == nil {
|
||||
log.Debugf("Added PromQL template for %s: %s", metric, templ)
|
||||
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
|
||||
} else {
|
||||
log.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
|
||||
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@@ -204,8 +205,8 @@ func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
metric string,
|
||||
scope schema.MetricScope,
|
||||
nodes []string,
|
||||
cluster string) (string, error) {
|
||||
|
||||
cluster string,
|
||||
) (string, error) {
|
||||
args := PromQLArgs{}
|
||||
if len(nodes) > 0 {
|
||||
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
|
||||
@@ -220,7 +221,7 @@ func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
|
||||
} else {
|
||||
query := buf.String()
|
||||
log.Debugf("PromQL: %s", query)
|
||||
cclog.Debugf("PromQL: %s", query)
|
||||
return query, nil
|
||||
}
|
||||
} else {
|
||||
@@ -233,12 +234,13 @@ func (pdb *PrometheusDataRepository) RowToSeries(
|
||||
from time.Time,
|
||||
step int64,
|
||||
steps int64,
|
||||
row *promm.SampleStream) schema.Series {
|
||||
row *promm.SampleStream,
|
||||
) schema.Series {
|
||||
ts := from.Unix()
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
// init array of expected length with NaN
|
||||
values := make([]schema.Float, steps+1)
|
||||
for i, _ := range values {
|
||||
for i := range values {
|
||||
values[i] = schema.NaN
|
||||
}
|
||||
// copy recorded values from prom sample pair
|
||||
@@ -263,8 +265,9 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
// TODO respect requested scope
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
@@ -276,13 +279,13 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
for i, resource := range job.Resources {
|
||||
nodes[i] = resource.Hostname
|
||||
}
|
||||
from := job.StartTime
|
||||
to := job.StartTime.Add(time.Duration(job.Duration) * time.Second)
|
||||
from := time.Unix(job.StartTime, 0)
|
||||
to := time.Unix(job.StartTime+int64(job.Duration), 0)
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
log.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
@@ -290,12 +293,12 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
|
||||
if metricConfig == nil {
|
||||
log.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
|
||||
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while formatting prometheus query")
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -306,13 +309,12 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
log.Warnf("Warnings: %v\n", warnings)
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
// init data structures
|
||||
@@ -335,7 +337,7 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
pdb.RowToSeries(from, step, steps, row))
|
||||
}
|
||||
// only add metric if at least one host returned data
|
||||
if !ok && len(jobMetric.Series) > 0{
|
||||
if !ok && len(jobMetric.Series) > 0 {
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
// sort by hostname to get uniform coloring
|
||||
@@ -351,14 +353,14 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
func (pdb *PrometheusDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
// map of metrics of nodes of stats
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job for stats")
|
||||
cclog.Warn("Error while loading job for stats")
|
||||
return nil, err
|
||||
}
|
||||
for metric, metricData := range data {
|
||||
@@ -376,7 +378,8 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
t0 := time.Now()
|
||||
// Map of hosts of metrics of value slices
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
@@ -388,19 +391,19 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
log.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
log.Warn("Error while formatting prometheus query")
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -411,13 +414,12 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
log.Warnf("Warnings: %v\n", warnings)
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
@@ -442,6 +444,190 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
log.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in Job-View StatsTable
|
||||
func (pdb *PrometheusDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable
|
||||
scopedJobStats := make(schema.ScopedJobStats)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while loading job for scopedJobStats")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for metric, metricData := range data {
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric]; !ok {
|
||||
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric][scope]; !ok {
|
||||
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
|
||||
}
|
||||
|
||||
for _, series := range metricData[scope].Series {
|
||||
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
|
||||
Hostname: series.Hostname,
|
||||
Data: &series.Statistics,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return scopedJobStats, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in NodeList-View
|
||||
func (pdb *PrometheusDataRepository) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList
|
||||
|
||||
// 0) Init additional vars
|
||||
var totalNodes int = 0
|
||||
var hasNextPage bool = false
|
||||
|
||||
// 1) Get list of all nodes
|
||||
var nodes []string
|
||||
if subCluster != "" {
|
||||
scNodes := archive.NodeLists[cluster][subCluster]
|
||||
nodes = scNodes.PrintList()
|
||||
} else {
|
||||
subClusterNodeLists := archive.NodeLists[cluster]
|
||||
for _, nodeList := range subClusterNodeLists {
|
||||
nodes = append(nodes, nodeList.PrintList()...)
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Filter nodes
|
||||
if nodeFilter != "" {
|
||||
filteredNodes := []string{}
|
||||
for _, node := range nodes {
|
||||
if strings.Contains(node, nodeFilter) {
|
||||
filteredNodes = append(filteredNodes, node)
|
||||
}
|
||||
}
|
||||
nodes = filteredNodes
|
||||
}
|
||||
|
||||
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ...
|
||||
totalNodes = len(nodes)
|
||||
sort.Strings(nodes)
|
||||
|
||||
// 3) Apply paging
|
||||
if len(nodes) > page.ItemsPerPage {
|
||||
start := (page.Page - 1) * page.ItemsPerPage
|
||||
end := start + page.ItemsPerPage
|
||||
if end >= len(nodes) {
|
||||
end = len(nodes)
|
||||
hasNextPage = false
|
||||
} else {
|
||||
hasNextPage = true
|
||||
}
|
||||
nodes = nodes[start:end]
|
||||
}
|
||||
|
||||
// 4) Fetch Data, based on pdb.LoadNodeData()
|
||||
|
||||
t0 := time.Now()
|
||||
// Map of hosts of jobData
|
||||
data := make(map[string]schema.JobData)
|
||||
|
||||
// query db for each metric
|
||||
// TODO: scopes seems to be always empty
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, totalNodes, hasNextPage, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, totalNodes, hasNextPage, err
|
||||
}
|
||||
|
||||
// ranged query over all nodes
|
||||
r := promv1.Range{
|
||||
Start: from,
|
||||
End: to,
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
if err != nil {
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, totalNodes, hasNextPage, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
steps := int64(to.Sub(from).Seconds()) / step
|
||||
|
||||
// iter rows of host, metric, values
|
||||
for _, row := range result.(promm.Matrix) {
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
|
||||
hostdata, ok := data[hostname]
|
||||
if !ok {
|
||||
hostdata = make(schema.JobData)
|
||||
data[hostname] = hostdata
|
||||
}
|
||||
|
||||
metricdata, ok := hostdata[metric]
|
||||
if !ok {
|
||||
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
data[hostname][metric] = metricdata
|
||||
}
|
||||
|
||||
// output per host, metric and scope
|
||||
scopeData, ok := metricdata[scope]
|
||||
if !ok {
|
||||
scopeData = &schema.JobMetric{
|
||||
Unit: metricConfig.Unit,
|
||||
Timestep: metricConfig.Timestep,
|
||||
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
|
||||
}
|
||||
data[hostname][metric][scope] = scopeData
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
|
||||
return data, totalNodes, hasNextPage, nil
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
@@ -9,14 +10,15 @@ import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
// Only a mock for unit-testing.
|
||||
// TestMetricDataRepository is only a mock for unit-testing.
|
||||
type TestMetricDataRepository struct{}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
|
||||
@@ -27,15 +29,26 @@ func (tmdr *TestMetricDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx)
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
@@ -44,7 +57,63 @@ func (tmdr *TestMetricDataRepository) LoadNodeData(
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func DeepCopy(jdTemp schema.JobData) schema.JobData {
|
||||
jd := make(schema.JobData, len(jdTemp))
|
||||
for k, v := range jdTemp {
|
||||
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jdTemp[k]))
|
||||
for k_, v_ := range v {
|
||||
jd[k][k_] = new(schema.JobMetric)
|
||||
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
|
||||
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
|
||||
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
|
||||
jd[k][k_].Series[i].Id = v_.Series[i].Id
|
||||
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
|
||||
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
|
||||
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
|
||||
}
|
||||
jd[k][k_].Timestep = v_.Timestep
|
||||
jd[k][k_].Unit.Base = v_.Unit.Base
|
||||
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
|
||||
if v_.StatisticsSeries != nil {
|
||||
// Init Slices
|
||||
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
|
||||
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
|
||||
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
|
||||
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
|
||||
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
|
||||
// Copy Data
|
||||
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
|
||||
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
|
||||
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
|
||||
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
|
||||
// Handle Percentiles
|
||||
for k__, v__ := range v_.StatisticsSeries.Percentiles {
|
||||
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
|
||||
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
|
||||
}
|
||||
} else {
|
||||
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
|
||||
}
|
||||
}
|
||||
}
|
||||
return jd
|
||||
}
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/mattn/go-sqlite3"
|
||||
"github.com/qustavo/sqlhooks/v2"
|
||||
@@ -33,6 +35,27 @@ type DatabaseOptions struct {
|
||||
ConnectionMaxIdleTime time.Duration
|
||||
}
|
||||
|
||||
func setupSqlite(db *sql.DB) (err error) {
|
||||
pragmas := []string{
|
||||
// "journal_mode = WAL",
|
||||
// "busy_timeout = 5000",
|
||||
// "synchronous = NORMAL",
|
||||
// "cache_size = 1000000000", // 1GB
|
||||
// "foreign_keys = true",
|
||||
"temp_store = memory",
|
||||
// "mmap_size = 3000000000",
|
||||
}
|
||||
|
||||
for _, pragma := range pragmas {
|
||||
_, err = db.Exec("PRAGMA " + pragma)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func Connect(driver string, db string) {
|
||||
var err error
|
||||
var dbHandle *sqlx.DB
|
||||
@@ -48,28 +71,34 @@ func Connect(driver string, db string) {
|
||||
|
||||
switch driver {
|
||||
case "sqlite3":
|
||||
// - Set WAL mode (not strictly necessary each time because it's persisted in the database, but good for first run)
|
||||
// - Set busy timeout, so concurrent writers wait on each other instead of erroring immediately
|
||||
// - Enable foreign key checks
|
||||
opts.URL += "?_journal=WAL&_timeout=5000&_fk=true"
|
||||
// TODO: Have separate DB handles for Writes and Reads
|
||||
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
|
||||
connectionUrlParams := make(url.Values)
|
||||
connectionUrlParams.Add("_txlock", "immediate")
|
||||
connectionUrlParams.Add("_journal_mode", "WAL")
|
||||
connectionUrlParams.Add("_busy_timeout", "5000")
|
||||
connectionUrlParams.Add("_synchronous", "NORMAL")
|
||||
connectionUrlParams.Add("_cache_size", "1000000000")
|
||||
connectionUrlParams.Add("_foreign_keys", "true")
|
||||
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionUrlParams.Encode())
|
||||
|
||||
if log.Loglevel() == "debug" {
|
||||
if cclog.Loglevel() == "debug" {
|
||||
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
|
||||
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
|
||||
} else {
|
||||
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
setupSqlite(dbHandle.DB)
|
||||
case "mysql":
|
||||
opts.URL += "?multiStatements=true"
|
||||
dbHandle, err = sqlx.Open("mysql", opts.URL)
|
||||
if err != nil {
|
||||
log.Fatalf("sqlx.Open() error: %v", err)
|
||||
}
|
||||
default:
|
||||
log.Fatalf("unsupported database driver: %s", driver)
|
||||
cclog.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error())
|
||||
}
|
||||
|
||||
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
|
||||
@@ -80,14 +109,14 @@ func Connect(driver string, db string) {
|
||||
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
|
||||
err = checkDBVersion(driver, dbHandle.DB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetConnection() *DBConnection {
|
||||
if dbConnInstance == nil {
|
||||
log.Fatalf("Database connection not initialized!")
|
||||
cclog.Fatalf("Database connection not initialized!")
|
||||
}
|
||||
|
||||
return dbConnInstance
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
@@ -8,21 +8,21 @@ import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
// Hooks satisfies the sqlhook.Hooks interface
|
||||
type Hooks struct{}
|
||||
|
||||
// Before hook will print the query with it's args and return the context with the timestamp
|
||||
func (h *Hooks) Before(ctx context.Context, query string, args ...interface{}) (context.Context, error) {
|
||||
log.Debugf("SQL query %s %q", query, args)
|
||||
func (h *Hooks) Before(ctx context.Context, query string, args ...any) (context.Context, error) {
|
||||
cclog.Debugf("SQL query %s %q", query, args)
|
||||
return context.WithValue(ctx, "begin", time.Now()), nil
|
||||
}
|
||||
|
||||
// After hook will get the timestamp registered on the Before hook and print the elapsed time
|
||||
func (h *Hooks) After(ctx context.Context, query string, args ...interface{}) (context.Context, error) {
|
||||
func (h *Hooks) After(ctx context.Context, query string, args ...any) (context.Context, error) {
|
||||
begin := ctx.Value("begin").(time.Time)
|
||||
log.Debugf("Took: %s\n", time.Since(begin))
|
||||
cclog.Debugf("Took: %s\n", time.Since(begin))
|
||||
return ctx, nil
|
||||
}
|
||||
|
||||
@@ -1,25 +1,24 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -30,12 +29,11 @@ var (
|
||||
)
|
||||
|
||||
type JobRepository struct {
|
||||
DB *sqlx.DB
|
||||
stmtCache *sq.StmtCache
|
||||
cache *lrucache.Cache
|
||||
archiveChannel chan *schema.Job
|
||||
driver string
|
||||
archivePending sync.WaitGroup
|
||||
DB *sqlx.DB
|
||||
stmtCache *sq.StmtCache
|
||||
cache *lrucache.Cache
|
||||
driver string
|
||||
Mutex sync.Mutex
|
||||
}
|
||||
|
||||
func GetJobRepository() *JobRepository {
|
||||
@@ -46,47 +44,59 @@ func GetJobRepository() *JobRepository {
|
||||
DB: db.DB,
|
||||
driver: db.Driver,
|
||||
|
||||
stmtCache: sq.NewStmtCache(db.DB),
|
||||
cache: lrucache.New(1024 * 1024),
|
||||
archiveChannel: make(chan *schema.Job, 128),
|
||||
stmtCache: sq.NewStmtCache(db.DB),
|
||||
cache: lrucache.New(1024 * 1024),
|
||||
}
|
||||
// start archiving worker
|
||||
go jobRepoInstance.archivingWorker()
|
||||
})
|
||||
return jobRepoInstance
|
||||
}
|
||||
|
||||
var jobColumns []string = []string{
|
||||
"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id",
|
||||
"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
|
||||
"job.duration", "job.walltime", "job.resources", "job.mem_used_max", "job.flops_any_avg", "job.mem_bw_avg", "job.load_avg", // "job.meta_data",
|
||||
"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster",
|
||||
"job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes",
|
||||
"job.num_hwthreads", "job.num_acc", "job.shared", "job.monitoring_status",
|
||||
"job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources",
|
||||
"job.footprint", "job.energy",
|
||||
}
|
||||
|
||||
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
|
||||
var jobCacheColumns []string = []string{
|
||||
"job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster",
|
||||
"job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition",
|
||||
"job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads",
|
||||
"job_cache.num_acc", "job_cache.shared", "job_cache.monitoring_status", "job_cache.smt",
|
||||
"job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources",
|
||||
"job_cache.footprint", "job_cache.energy",
|
||||
}
|
||||
|
||||
func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) {
|
||||
job := &schema.Job{}
|
||||
|
||||
if err := row.Scan(
|
||||
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
|
||||
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
|
||||
&job.Duration, &job.Walltime, &job.RawResources, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.LoadAvg /*&job.RawMetaData*/); err != nil {
|
||||
log.Warnf("Error while scanning rows (Job): %v", err)
|
||||
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster,
|
||||
&job.StartTime, &job.Partition, &job.ArrayJobID, &job.NumNodes, &job.NumHWThreads,
|
||||
&job.NumAcc, &job.Shared, &job.MonitoringStatus, &job.SMT, &job.State,
|
||||
&job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil {
|
||||
cclog.Warnf("Error while scanning rows (Job): %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
|
||||
log.Warn("Error while unmarhsaling raw resources json")
|
||||
cclog.Warn("Error while unmarshaling raw resources json")
|
||||
return nil, err
|
||||
}
|
||||
job.RawResources = nil
|
||||
|
||||
// if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
|
||||
cclog.Warnf("Error while unmarshaling raw footprint json: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
job.RawFootprint = nil
|
||||
|
||||
job.StartTime = time.Unix(job.StartTimeUnix, 0)
|
||||
if job.Duration == 0 && job.State == schema.JobStateRunning {
|
||||
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
||||
// Always ensure accurate duration for running jobs
|
||||
if job.State == schema.JobStateRunning {
|
||||
job.Duration = int32(time.Now().Unix() - job.StartTime)
|
||||
}
|
||||
|
||||
job.RawResources = nil
|
||||
return job, nil
|
||||
}
|
||||
|
||||
@@ -99,7 +109,7 @@ func (r *JobRepository) Optimize() error {
|
||||
return err
|
||||
}
|
||||
case "mysql":
|
||||
log.Info("Optimize currently not supported for mysql driver")
|
||||
cclog.Info("Optimize currently not supported for mysql driver")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -140,17 +150,6 @@ func (r *JobRepository) Flush() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) {
|
||||
jobLink := &model.JobLink{}
|
||||
if err := row.Scan(
|
||||
&jobLink.ID, &jobLink.JobID); err != nil {
|
||||
log.Warn("Error while scanning rows (jobLink)")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return jobLink, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) {
|
||||
start := time.Now()
|
||||
cachekey := fmt.Sprintf("metadata:%d", job.ID)
|
||||
@@ -161,7 +160,7 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error
|
||||
|
||||
if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil {
|
||||
log.Warn("Error while scanning for job metadata")
|
||||
cclog.Warn("Error while scanning for job metadata")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -170,12 +169,12 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
|
||||
log.Warn("Error while unmarshaling raw metadata json")
|
||||
cclog.Warn("Error while unmarshaling raw metadata json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
|
||||
log.Debugf("Timer FetchMetadata %s", time.Since(start))
|
||||
cclog.Debugf("Timer FetchMetadata %s", time.Since(start))
|
||||
return job.MetaData, nil
|
||||
}
|
||||
|
||||
@@ -184,16 +183,14 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
|
||||
r.cache.Del(cachekey)
|
||||
if job.MetaData == nil {
|
||||
if _, err = r.FetchMetadata(job); err != nil {
|
||||
log.Warnf("Error while fetching metadata for job, DB ID '%v'", job.ID)
|
||||
cclog.Warnf("Error while fetching metadata for job, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if job.MetaData != nil {
|
||||
cpy := make(map[string]string, len(job.MetaData)+1)
|
||||
for k, v := range job.MetaData {
|
||||
cpy[k] = v
|
||||
}
|
||||
maps.Copy(cpy, job.MetaData)
|
||||
cpy[key] = val
|
||||
job.MetaData = cpy
|
||||
} else {
|
||||
@@ -201,12 +198,15 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
|
||||
}
|
||||
|
||||
if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {
|
||||
log.Warnf("Error while marshaling metadata for job, DB ID '%v'", job.ID)
|
||||
cclog.Warnf("Error while marshaling metadata for job, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID)
|
||||
if _, err = sq.Update("job").
|
||||
Set("meta_data", job.RawMetaData).
|
||||
Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).Exec(); err != nil {
|
||||
cclog.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -214,222 +214,54 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
|
||||
return archive.UpdateMetadata(job, job.MetaData)
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
log.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindAll(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
|
||||
cclog.Warn("Error while scanning for job footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 10)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
log.Debugf("Timer FindAll %s", time.Since(start))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindConcurrentJobs(
|
||||
ctx context.Context,
|
||||
job *schema.Job,
|
||||
) (*model.JobLinkResultList, error) {
|
||||
if job == nil {
|
||||
if len(job.RawFootprint) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
startTime = job.StartTimeUnix
|
||||
hostname := job.Resources[0].Hostname
|
||||
|
||||
if job.State == schema.JobStateRunning {
|
||||
stopTime = time.Now().Unix()
|
||||
} else {
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw footprint json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
items := make([]*model.JobLink, 0, 10)
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
cclog.Debugf("Timer FetchFootprint %s", time.Since(start))
|
||||
return job.Footprint, nil
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) {
|
||||
start := time.Now()
|
||||
cachekey := fmt.Sprintf("energyFootprint:%d", job.ID)
|
||||
if cached := r.cache.Get(cachekey, nil); cached != nil {
|
||||
job.EnergyFootprint = cached.(map[string]float64)
|
||||
return job.EnergyFootprint, nil
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
if err := sq.Select("job.energy_footprint").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawEnergyFootprint); err != nil {
|
||||
cclog.Warn("Error while scanning for job energy_footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
if len(job.RawEnergyFootprint) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
cnt := len(items)
|
||||
|
||||
return &model.JobLinkResultList{
|
||||
ListQuery: &queryString,
|
||||
Items: items,
|
||||
Count: &cnt,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
if err := json.Unmarshal(job.RawEnergyFootprint, &job.EnergyFootprint); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw energy footprint json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
res, err := r.DB.NamedExec(`INSERT INTO job (
|
||||
job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
|
||||
);`, job)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
r.cache.Put(cachekey, job.EnergyFootprint, len(job.EnergyFootprint), 24*time.Hour)
|
||||
cclog.Debugf("Timer FetchEnergyFootprint %s", time.Since(start))
|
||||
return job.EnergyFootprint, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
|
||||
@@ -441,9 +273,9 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
|
||||
|
||||
if err != nil {
|
||||
s, _, _ := qd.ToSql()
|
||||
log.Errorf(" DeleteJobsBefore(%d) with %s: error %#v", startTime, s, err)
|
||||
cclog.Errorf(" DeleteJobsBefore(%d) with %s: error %#v", startTime, s, err)
|
||||
} else {
|
||||
log.Debugf("DeleteJobsBefore(%d): Deleted %d jobs", startTime, cnt)
|
||||
cclog.Debugf("DeleteJobsBefore(%d): Deleted %d jobs", startTime, cnt)
|
||||
}
|
||||
return cnt, err
|
||||
}
|
||||
@@ -454,126 +286,29 @@ func (r *JobRepository) DeleteJobById(id int64) error {
|
||||
|
||||
if err != nil {
|
||||
s, _, _ := qd.ToSql()
|
||||
log.Errorf("DeleteJobById(%d) with %s : error %#v", id, s, err)
|
||||
cclog.Errorf("DeleteJobById(%d) with %s : error %#v", id, s, err)
|
||||
} else {
|
||||
log.Debugf("DeleteJobById(%d): Success", id)
|
||||
cclog.Debugf("DeleteJobById(%d): Success", id)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) MarkArchived(
|
||||
jobId int64,
|
||||
monitoringStatus int32,
|
||||
metricStats map[string]schema.JobStatistics,
|
||||
) error {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
for metric, stats := range metricStats {
|
||||
switch metric {
|
||||
case "flops_any":
|
||||
stmt = stmt.Set("flops_any_avg", stats.Avg)
|
||||
case "mem_used":
|
||||
stmt = stmt.Set("mem_used_max", stats.Max)
|
||||
case "mem_bw":
|
||||
stmt = stmt.Set("mem_bw_avg", stats.Avg)
|
||||
case "load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "cpu_load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "net_bw":
|
||||
stmt = stmt.Set("net_bw_avg", stats.Avg)
|
||||
case "file_bw":
|
||||
stmt = stmt.Set("file_bw_avg", stats.Avg)
|
||||
default:
|
||||
log.Debugf("MarkArchived() Metric '%v' unknown", metric)
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warn("Error while marking job as archived")
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Archiving worker thread
|
||||
func (r *JobRepository) archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-r.archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := r.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update the jobs database entry one last time:
|
||||
if err := r.MarkArchived(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
r.archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func (r *JobRepository) TriggerArchiving(job *schema.Job) {
|
||||
r.archivePending.Add(1)
|
||||
r.archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func (r *JobRepository) WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
r.archivePending.Wait()
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) {
|
||||
if _, err := strconv.Atoi(searchterm); err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId
|
||||
return searchterm, "", "", ""
|
||||
} else { // Has to have letters and logged-in user for other guesses
|
||||
if user != nil {
|
||||
// Find username in jobs (match)
|
||||
uresult, _ := r.FindColumnValue(user, searchterm, "job", "user", "user", false)
|
||||
// Find username by username in job table (match)
|
||||
uresult, _ := r.FindColumnValue(user, searchterm, "job", "hpc_user", "hpc_user", false)
|
||||
if uresult != "" {
|
||||
return "", uresult, "", ""
|
||||
}
|
||||
// Find username by name (like)
|
||||
nresult, _ := r.FindColumnValue(user, searchterm, "user", "username", "name", true)
|
||||
// Find username by real name in hpc_user table (like)
|
||||
nresult, _ := r.FindColumnValue(user, searchterm, "hpc_user", "username", "name", true)
|
||||
if nresult != "" {
|
||||
return "", nresult, "", ""
|
||||
}
|
||||
// Find projectId in jobs (match)
|
||||
// Find projectId by projectId in job table (match)
|
||||
presult, _ := r.FindColumnValue(user, searchterm, "job", "project", "project", false)
|
||||
if presult != "" {
|
||||
return "", "", presult, ""
|
||||
@@ -602,10 +337,10 @@ func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, ta
|
||||
|
||||
// theSql, args, theErr := theQuery.ToSql()
|
||||
// if theErr != nil {
|
||||
// log.Warn("Error while converting query to sql")
|
||||
// cclog.Warn("Error while converting query to sql")
|
||||
// return "", err
|
||||
// }
|
||||
// log.Debugf("SQL query (FindColumnValue): `%s`, args: %#v", theSql, args)
|
||||
// cclog.Debugf("SQL query (FindColumnValue): `%s`, args: %#v", theSql, args)
|
||||
|
||||
err := theQuery.RunWith(r.stmtCache).QueryRow().Scan(&result)
|
||||
|
||||
@@ -616,7 +351,7 @@ func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, ta
|
||||
}
|
||||
return "", ErrNotFound
|
||||
} else {
|
||||
log.Infof("Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden", user.Name, query, table)
|
||||
cclog.Infof("Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden", user.Name, query, table)
|
||||
return "", ErrForbidden
|
||||
}
|
||||
}
|
||||
@@ -635,7 +370,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table
|
||||
err := rows.Scan(&result)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
log.Warnf("Error while scanning rows: %v", err)
|
||||
cclog.Warnf("Error while scanning rows: %v", err)
|
||||
return emptyResult, err
|
||||
}
|
||||
results = append(results, result)
|
||||
@@ -645,7 +380,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table
|
||||
return emptyResult, ErrNotFound
|
||||
|
||||
} else {
|
||||
log.Infof("Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden", user.Name, query, table)
|
||||
cclog.Infof("Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden", user.Name, query, table)
|
||||
return emptyResult, ErrForbidden
|
||||
}
|
||||
}
|
||||
@@ -653,9 +388,9 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table
|
||||
func (r *JobRepository) Partitions(cluster string) ([]string, error) {
|
||||
var err error
|
||||
start := time.Now()
|
||||
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
|
||||
partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) {
|
||||
parts := []string{}
|
||||
if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
|
||||
if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
|
||||
return nil, 0, 1000
|
||||
}
|
||||
|
||||
@@ -664,7 +399,7 @@ func (r *JobRepository) Partitions(cluster string) ([]string, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
log.Debugf("Timer Partitions %s", time.Since(start))
|
||||
cclog.Debugf("Timer Partitions %s", time.Since(start))
|
||||
return partitions.([]string), nil
|
||||
}
|
||||
|
||||
@@ -678,7 +413,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
||||
Where("job.cluster = ?", cluster).
|
||||
RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -689,11 +424,11 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
||||
var resources []*schema.Resource
|
||||
var subcluster string
|
||||
if err := rows.Scan(&raw, &subcluster); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
if err := json.Unmarshal(raw, &resources); err != nil {
|
||||
log.Warn("Error while unmarshaling raw resources json")
|
||||
cclog.Warn("Error while unmarshaling raw resources json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -708,10 +443,11 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
||||
}
|
||||
}
|
||||
|
||||
log.Debugf("Timer AllocatedNodes %s", time.Since(start))
|
||||
cclog.Debugf("Timer AllocatedNodes %s", time.Since(start))
|
||||
return subclusters, nil
|
||||
}
|
||||
|
||||
// FIXME: Set duration to requested walltime?
|
||||
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
start := time.Now()
|
||||
res, err := sq.Update("job").
|
||||
@@ -723,20 +459,87 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
|
||||
RunWith(r.DB).Exec()
|
||||
if err != nil {
|
||||
log.Warn("Error while stopping jobs exceeding walltime")
|
||||
cclog.Warn("Error while stopping jobs exceeding walltime")
|
||||
return err
|
||||
}
|
||||
|
||||
rowsAffected, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
log.Warn("Error while fetching affected rows after stopping due to exceeded walltime")
|
||||
cclog.Warn("Error while fetching affected rows after stopping due to exceeded walltime")
|
||||
return err
|
||||
}
|
||||
|
||||
if rowsAffected > 0 {
|
||||
log.Infof("%d jobs have been marked as failed due to running too long", rowsAffected)
|
||||
cclog.Infof("%d jobs have been marked as failed due to running too long", rowsAffected)
|
||||
}
|
||||
log.Debugf("Timer StopJobsExceedingWalltimeBy %s", time.Since(start))
|
||||
cclog.Debugf("Timer StopJobsExceedingWalltimeBy %s", time.Since(start))
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindJobIdsByTag(tagId int64) ([]int64, error) {
|
||||
query := sq.Select("job.id").From("job").
|
||||
Join("jobtag ON jobtag.job_id = job.id").
|
||||
Where(sq.Eq{"jobtag.tag_id": tagId}).Distinct()
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
jobIds := make([]int64, 0, 100)
|
||||
|
||||
for rows.Next() {
|
||||
var jobId int64
|
||||
|
||||
if err := rows.Scan(&jobId); err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobIds = append(jobIds, jobId)
|
||||
}
|
||||
|
||||
return jobIds, nil
|
||||
}
|
||||
|
||||
// FIXME: Reconsider filtering short jobs with harcoded threshold
|
||||
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
|
||||
query := sq.Select(jobColumns...).From("job").
|
||||
Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
|
||||
Where("job.job_state = 'running'").
|
||||
Where("job.duration > 600")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
cclog.Infof("Return job count %d", len(jobs))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateDuration() error {
|
||||
stmnt := sq.Update("job").
|
||||
Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())).
|
||||
Where("job_state = 'running'")
|
||||
|
||||
_, err := stmnt.RunWith(r.stmtCache).Exec()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -748,18 +551,18 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64
|
||||
}
|
||||
|
||||
if startTimeBegin == 0 {
|
||||
log.Infof("Find jobs before %d", startTimeEnd)
|
||||
cclog.Infof("Find jobs before %d", startTimeEnd)
|
||||
query = sq.Select(jobColumns...).From("job").Where(fmt.Sprintf(
|
||||
"job.start_time < %d", startTimeEnd))
|
||||
} else {
|
||||
log.Infof("Find jobs between %d and %d", startTimeBegin, startTimeEnd)
|
||||
cclog.Infof("Find jobs between %d and %d", startTimeBegin, startTimeEnd)
|
||||
query = sq.Select(jobColumns...).From("job").Where(fmt.Sprintf(
|
||||
"job.start_time BETWEEN %d AND %d", startTimeBegin, startTimeEnd))
|
||||
}
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -768,37 +571,128 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
log.Warn("Error while scanning rows")
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
log.Infof("Return job count %d", len(jobs))
|
||||
cclog.Infof("Return job count %d", len(jobs))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
const NamedJobInsert string = `INSERT INTO job (
|
||||
job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data,
|
||||
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data,
|
||||
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
|
||||
);`
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
|
||||
res, err := r.DB.NamedExec(NamedJobInsert, job)
|
||||
if err != nil {
|
||||
log.Warn("Error while NamedJobInsert")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
log.Warn("Error while getting last insert ID")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error {
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) MarkArchived(
|
||||
stmt sq.UpdateBuilder,
|
||||
monitoringStatus int32,
|
||||
) sq.UpdateBuilder {
|
||||
return stmt.Set("monitoring_status", monitoringStatus)
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateEnergy(
|
||||
stmt sq.UpdateBuilder,
|
||||
jobMeta *schema.Job,
|
||||
) (sq.UpdateBuilder, error) {
|
||||
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return stmt, err
|
||||
}
|
||||
energyFootprint := make(map[string]float64)
|
||||
|
||||
// Total Job Energy Outside Loop
|
||||
totalEnergy := 0.0
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
// Always Init Metric Energy Inside Loop
|
||||
metricEnergy := 0.0
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules or Wh)
|
||||
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
|
||||
// FIXME: Needs sum as stats type
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
// Unit: (W * (s / 3600)) / 1000 = kWh
|
||||
// Round 2 Digits: round(Energy * 100) / 100
|
||||
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
|
||||
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||
rawEnergy := ((LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
|
||||
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
|
||||
}
|
||||
|
||||
energyFootprint[fp] = metricEnergy
|
||||
totalEnergy += metricEnergy
|
||||
|
||||
// cclog.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
if rawFootprint, err = json.Marshal(energyFootprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return stmt, err
|
||||
}
|
||||
|
||||
return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100.0) / 100.0)), nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateFootprint(
|
||||
stmt sq.UpdateBuilder,
|
||||
jobMeta *schema.Job,
|
||||
) (sq.UpdateBuilder, error) {
|
||||
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return stmt, err
|
||||
}
|
||||
footprint := make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
var statType string
|
||||
for _, gm := range archive.GlobalMetricList {
|
||||
if gm.Name == fp {
|
||||
statType = gm.Footprint
|
||||
}
|
||||
}
|
||||
|
||||
if statType != "avg" && statType != "min" && statType != "max" {
|
||||
cclog.Warnf("unknown statType for footprint update: %s", statType)
|
||||
return stmt, fmt.Errorf("unknown statType for footprint update: %s", statType)
|
||||
}
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
footprint[name] = LoadJobStat(jobMeta, fp, statType)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
if rawFootprint, err = json.Marshal(footprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return stmt, err
|
||||
}
|
||||
|
||||
return stmt.Set("footprint", string(rawFootprint)), nil
|
||||
}
|
||||
|
||||
140
internal/repository/jobCreate.go
Normal file
140
internal/repository/jobCreate.go
Normal file
@@ -0,0 +1,140 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
const NamedJobCacheInsert string = `INSERT INTO job_cache (
|
||||
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
|
||||
);`
|
||||
|
||||
const NamedJobInsert string = `INSERT INTO job (
|
||||
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
|
||||
);`
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
|
||||
r.Mutex.Lock()
|
||||
res, err := r.DB.NamedExec(NamedJobCacheInsert, job)
|
||||
r.Mutex.Unlock()
|
||||
if err != nil {
|
||||
cclog.Warn("Error while NamedJobInsert")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
cclog.Warn("Error while getting last insert ID")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
|
||||
r.Mutex.Lock()
|
||||
defer r.Mutex.Unlock()
|
||||
|
||||
query := sq.Select(jobCacheColumns...).From("job_cache")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
_, err = r.DB.Exec(
|
||||
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while Job sync: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache")
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while Job cache clean: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
return r.InsertJob(job)
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) StopCached(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
stmt := sq.Update("job_cache").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
312
internal/repository/jobFind.go
Normal file
312
internal/repository/jobFind.go
Normal file
@@ -0,0 +1,312 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
|
||||
|
||||
cclog.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindCached(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
q := sq.Select(jobCacheColumns...).From("job_cache").
|
||||
Where("job_cache.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job_cache.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job_cache.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindAll(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 10)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
cclog.Debugf("Timer FindAll %s", time.Since(start))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// Get complete joblist only consisting of db ids.
|
||||
// This is useful to process large job counts and intended to be used
|
||||
// together with FindById to process jobs one by one
|
||||
func (r *JobRepository) GetJobList() ([]int64, error) {
|
||||
query := sq.Select("id").From("job").
|
||||
Where("job.job_state != 'running'")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jl := make([]int64, 0, 1000)
|
||||
for rows.Next() {
|
||||
var id int64
|
||||
err := rows.Scan(&id)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jl = append(jl, id)
|
||||
}
|
||||
|
||||
cclog.Infof("Return job count %d", len(jl))
|
||||
return jl, nil
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdWithUser executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id. The user is passed directly,
|
||||
// instead as part of the context.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
|
||||
q, qerr := SecurityCheckWithUser(user, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdDirect executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByJobId executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id and the clustername.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// IsJobOwner executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id,a username and the cluster.
|
||||
// It returns a bool.
|
||||
// If job was found, user is owner: test err != sql.ErrNoRows
|
||||
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
|
||||
q := sq.Select("id").
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.hpc_user = ?", user).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
return err != sql.ErrNoRows
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindConcurrentJobs(
|
||||
ctx context.Context,
|
||||
job *schema.Job,
|
||||
) (*model.JobLinkResultList, error) {
|
||||
if job == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
startTime = job.StartTime
|
||||
hostname := job.Resources[0].Hostname
|
||||
|
||||
if job.State == schema.JobStateRunning {
|
||||
stopTime = time.Now().Unix()
|
||||
} else {
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
items := make([]*model.JobLink, 0, 10)
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
cnt := len(items)
|
||||
|
||||
return &model.JobLinkResultList{
|
||||
ListQuery: &queryString,
|
||||
Items: items,
|
||||
Count: &cnt,
|
||||
}, nil
|
||||
}
|
||||
57
internal/repository/jobHooks.go
Normal file
57
internal/repository/jobHooks.go
Normal file
@@ -0,0 +1,57 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type JobHook interface {
|
||||
JobStartCallback(job *schema.Job)
|
||||
JobStopCallback(job *schema.Job)
|
||||
}
|
||||
|
||||
var (
|
||||
initOnce sync.Once
|
||||
hooks []JobHook
|
||||
)
|
||||
|
||||
func RegisterJobJook(hook JobHook) {
|
||||
initOnce.Do(func() {
|
||||
hooks = make([]JobHook, 0)
|
||||
})
|
||||
|
||||
if hook != nil {
|
||||
hooks = append(hooks, hook)
|
||||
}
|
||||
}
|
||||
|
||||
func CallJobStartHooks(jobs []*schema.Job) {
|
||||
if hooks == nil {
|
||||
return
|
||||
}
|
||||
|
||||
for _, hook := range hooks {
|
||||
if hook != nil {
|
||||
for _, job := range jobs {
|
||||
hook.JobStartCallback(job)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func CallJobStopHooks(job *schema.Job) {
|
||||
if hooks == nil {
|
||||
return
|
||||
}
|
||||
|
||||
for _, hook := range hooks {
|
||||
if hook != nil {
|
||||
hook.JobStopCallback(job)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
@@ -12,9 +12,10 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
@@ -22,8 +23,8 @@ func (r *JobRepository) QueryJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter,
|
||||
page *model.PageRequest,
|
||||
order *model.OrderByInput) ([]*schema.Job, error) {
|
||||
|
||||
order *model.OrderByInput,
|
||||
) ([]*schema.Job, error) {
|
||||
query, qerr := SecurityCheck(ctx, sq.Select(jobColumns...).From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
@@ -31,14 +32,28 @@ func (r *JobRepository) QueryJobs(
|
||||
|
||||
if order != nil {
|
||||
field := toSnakeCase(order.Field)
|
||||
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order")
|
||||
if order.Type == "col" {
|
||||
// "col": Fixed column name query
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
|
||||
}
|
||||
} else {
|
||||
// "foot": Order by footprint JSON field values
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(meta_data)")
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,7 +68,8 @@ func (r *JobRepository) QueryJobs(
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
queryString, queryVars, _ := query.ToSql()
|
||||
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -62,7 +78,7 @@ func (r *JobRepository) QueryJobs(
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
log.Warn("Error while scanning rows (Jobs)")
|
||||
cclog.Warn("Error while scanning rows (Jobs)")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
@@ -73,9 +89,10 @@ func (r *JobRepository) QueryJobs(
|
||||
|
||||
func (r *JobRepository) CountJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter) (int, error) {
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("count(*)").From("job"))
|
||||
filters []*model.JobFilter,
|
||||
) (int, error) {
|
||||
// DISTICT count for tags filters, does not affect other queries
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
|
||||
if qerr != nil {
|
||||
return 0, qerr
|
||||
}
|
||||
@@ -92,35 +109,48 @@ func (r *JobRepository) CountJobs(
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
if user == nil {
|
||||
var qnil sq.SelectBuilder
|
||||
return qnil, fmt.Errorf("user context is nil")
|
||||
} else if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleApi}) { // Admin & Co. : All jobs
|
||||
return query, nil
|
||||
} else if user.HasRole(schema.RoleManager) { // Manager : Add filter for managed projects' jobs only + personal jobs
|
||||
if len(user.Projects) != 0 {
|
||||
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.user": user.Username}}), nil
|
||||
} else {
|
||||
log.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
}
|
||||
} else if user.HasRole(schema.RoleUser) { // User : Only personal jobs
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
} else {
|
||||
// Shortterm compatibility: Return User-Query if no roles:
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
// // On the longterm: Return Error instead of fallback:
|
||||
// var qnil sq.SelectBuilder
|
||||
// return qnil, fmt.Errorf("user '%s' with unknown roles [%#v]", user.Username, user.Roles)
|
||||
}
|
||||
|
||||
switch {
|
||||
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
|
||||
return query, nil
|
||||
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
|
||||
return query, nil
|
||||
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
|
||||
if len(user.Projects) != 0 {
|
||||
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
|
||||
} else {
|
||||
cclog.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
}
|
||||
case user.HasRole(schema.RoleUser): // User : Only personal jobs
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
default: // No known Role, return error
|
||||
var qnil sq.SelectBuilder
|
||||
return qnil, fmt.Errorf("user has no or unknown roles")
|
||||
}
|
||||
}
|
||||
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
|
||||
return SecurityCheckWithUser(user, query)
|
||||
}
|
||||
|
||||
// Build a sq.SelectBuilder out of a schema.JobFilter.
|
||||
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if filter.Tags != nil {
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
|
||||
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
|
||||
}
|
||||
if filter.DbID != nil {
|
||||
dbIDs := make([]string, len(filter.DbID))
|
||||
copy(dbIDs, filter.DbID)
|
||||
query = query.Where(sq.Eq{"job.id": dbIDs})
|
||||
}
|
||||
if filter.JobID != nil {
|
||||
query = buildStringCondition("job.job_id", filter.JobID, query)
|
||||
@@ -129,7 +159,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
|
||||
}
|
||||
if filter.User != nil {
|
||||
query = buildStringCondition("job.user", filter.User, query)
|
||||
query = buildStringCondition("job.hpc_user", filter.User, query)
|
||||
}
|
||||
if filter.Project != nil {
|
||||
query = buildStringCondition("job.project", filter.Project, query)
|
||||
@@ -141,19 +171,21 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
query = buildStringCondition("job.cluster", filter.Cluster, query)
|
||||
}
|
||||
if filter.Partition != nil {
|
||||
query = buildStringCondition("job.partition", filter.Partition, query)
|
||||
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
|
||||
}
|
||||
if filter.StartTime != nil {
|
||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||
}
|
||||
if filter.Duration != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
|
||||
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||
}
|
||||
if filter.MinRunningFor != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
|
||||
}
|
||||
if filter.Shared != nil {
|
||||
query = query.Where("job.shared = ?", *filter.Shared)
|
||||
}
|
||||
if filter.State != nil {
|
||||
states := make([]string, len(filter.State))
|
||||
for i, val := range filter.State {
|
||||
@@ -172,41 +204,60 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
|
||||
}
|
||||
if filter.Node != nil {
|
||||
query = buildStringCondition("job.resources", filter.Node, query)
|
||||
query = buildResourceJsonCondition("hostname", filter.Node, query)
|
||||
}
|
||||
if filter.FlopsAnyAvg != nil {
|
||||
query = buildFloatCondition("job.flops_any_avg", filter.FlopsAnyAvg, query)
|
||||
if filter.Energy != nil {
|
||||
query = buildFloatCondition("job.energy", filter.Energy, query)
|
||||
}
|
||||
if filter.MemBwAvg != nil {
|
||||
query = buildFloatCondition("job.mem_bw_avg", filter.MemBwAvg, query)
|
||||
}
|
||||
if filter.LoadAvg != nil {
|
||||
query = buildFloatCondition("job.load_avg", filter.LoadAvg, query)
|
||||
}
|
||||
if filter.MemUsedMax != nil {
|
||||
query = buildFloatCondition("job.mem_used_max", filter.MemUsedMax, query)
|
||||
if filter.MetricStats != nil {
|
||||
for _, ms := range filter.MetricStats {
|
||||
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
|
||||
}
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
func buildIntCondition(field string, cond *schema.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
func buildIntCondition(field string, cond *config.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.From != nil && cond.To != nil {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
|
||||
} else if cond.From != nil {
|
||||
return query.Where("? <= "+field, cond.From.Unix())
|
||||
} else if cond.To != nil {
|
||||
return query.Where(field+" <= ?", cond.To.Unix())
|
||||
} else if cond.Range != "" {
|
||||
now := time.Now().Unix()
|
||||
var then int64
|
||||
switch cond.Range {
|
||||
case "last6h":
|
||||
then = now - (60 * 60 * 6)
|
||||
case "last24h":
|
||||
then = now - (60 * 60 * 24)
|
||||
case "last7d":
|
||||
then = now - (60 * 60 * 24 * 7)
|
||||
case "last30d":
|
||||
then = now - (60 * 60 * 24 * 30)
|
||||
default:
|
||||
cclog.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
|
||||
return query
|
||||
}
|
||||
return query.Where(field+" BETWEEN ? AND ?", then, now)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(footprint)")
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
|
||||
}
|
||||
|
||||
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
@@ -227,9 +278,7 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
|
||||
}
|
||||
if cond.In != nil {
|
||||
queryElements := make([]string, len(cond.In))
|
||||
for i, val := range cond.In {
|
||||
queryElements[i] = val
|
||||
}
|
||||
copy(queryElements, cond.In)
|
||||
return query.Where(sq.Or{sq.Eq{field: queryElements}})
|
||||
}
|
||||
return query
|
||||
@@ -257,13 +306,37 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.
|
||||
return query
|
||||
}
|
||||
|
||||
var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
|
||||
var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
|
||||
func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(resources)")
|
||||
// add "AND" Sql query Block for field match
|
||||
if cond.Eq != nil {
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq)
|
||||
}
|
||||
if cond.Neq != nil { // Currently Unused
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") != ?)", *cond.Neq)
|
||||
}
|
||||
if cond.StartsWith != nil { // Currently Unused
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\")) LIKE ?)", fmt.Sprint(*cond.StartsWith, "%"))
|
||||
}
|
||||
if cond.EndsWith != nil { // Currently Unused
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") LIKE ?)", fmt.Sprint("%", *cond.EndsWith))
|
||||
}
|
||||
if cond.Contains != nil {
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") LIKE ?)", fmt.Sprint("%", *cond.Contains, "%"))
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
var (
|
||||
matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
|
||||
matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
|
||||
)
|
||||
|
||||
func toSnakeCase(str string) string {
|
||||
for _, c := range str {
|
||||
if c == '\'' || c == '\\' {
|
||||
log.Panic("toSnakeCase() attack vector!")
|
||||
cclog.Panic("toSnakeCase() attack vector!")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,51 +1,65 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func TestFind(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
jobId, cluster, startTime := int64(398998), "fritz", int64(1675957496)
|
||||
job, err := r.Find(&jobId, &cluster, &startTime)
|
||||
jobID, cluster, startTime := int64(398800), "fritz", int64(1675954712)
|
||||
job, err := r.Find(&jobID, &cluster, &startTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// fmt.Printf("%+v", job)
|
||||
|
||||
if job.ID != 5 {
|
||||
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID)
|
||||
if *job.ID != 345 {
|
||||
t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 345", job.JobID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindById(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
job, err := r.FindById(5)
|
||||
job, err := r.FindById(getContext(t), 338)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// fmt.Printf("%+v", job)
|
||||
|
||||
if job.JobID != 398998 {
|
||||
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1404396", job.JobID)
|
||||
if job.JobID != 398793 {
|
||||
t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 1404396", job.JobID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTags(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
tags, counts, err := r.CountTags(nil)
|
||||
const contextUserKey ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
ctx := context.WithValue(getContext(t), contextUserKey, contextUserValue)
|
||||
|
||||
// Test Tag has Scope "global"
|
||||
tags, counts, err := r.CountTags(GetUserFromContext(ctx))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
@@ -9,14 +10,14 @@ import (
|
||||
"embed"
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/golang-migrate/migrate/v4"
|
||||
"github.com/golang-migrate/migrate/v4/database/mysql"
|
||||
"github.com/golang-migrate/migrate/v4/database/sqlite3"
|
||||
"github.com/golang-migrate/migrate/v4/source/iofs"
|
||||
)
|
||||
|
||||
const Version uint = 7
|
||||
const Version uint = 10
|
||||
|
||||
//go:embed migrations/*
|
||||
var migrationFiles embed.FS
|
||||
@@ -54,13 +55,13 @@ func checkDBVersion(backend string, db *sql.DB) error {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
log.Fatalf("unsupported database backend: %s", backend)
|
||||
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
|
||||
}
|
||||
|
||||
v, dirty, err := m.Version()
|
||||
if err != nil {
|
||||
if err == migrate.ErrNilVersion {
|
||||
log.Warn("Legacy database without version or missing database file!")
|
||||
cclog.Warn("Legacy database without version or missing database file!")
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
@@ -84,7 +85,7 @@ func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err erro
|
||||
case "sqlite3":
|
||||
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
|
||||
@@ -102,7 +103,7 @@ func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err erro
|
||||
return m, err
|
||||
}
|
||||
default:
|
||||
log.Fatalf("unsupported database backend: %s", backend)
|
||||
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
|
||||
}
|
||||
|
||||
return m, nil
|
||||
@@ -114,9 +115,26 @@ func MigrateDB(backend string, db string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
v, dirty, err := m.Version()
|
||||
if err != nil {
|
||||
if err == migrate.ErrNilVersion {
|
||||
cclog.Warn("Legacy database without version or missing database file!")
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if v < Version {
|
||||
cclog.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version)
|
||||
}
|
||||
|
||||
if dirty {
|
||||
return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version)
|
||||
}
|
||||
|
||||
if err := m.Up(); err != nil {
|
||||
if err == migrate.ErrNoChange {
|
||||
log.Info("DB already up to date!")
|
||||
cclog.Info("DB already up to date!")
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
@@ -134,7 +152,7 @@ func RevertDB(backend string, db string) error {
|
||||
|
||||
if err := m.Migrate(Version - 1); err != nil {
|
||||
if err == migrate.ErrNoChange {
|
||||
log.Info("DB already up to date!")
|
||||
cclog.Info("DB already up to date!")
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
ALTER TABLE job DROP energy;
|
||||
ALTER TABLE job DROP energy_footprint;
|
||||
ALTER TABLE job ADD COLUMN flops_any_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_used_max;
|
||||
ALTER TABLE job ADD COLUMN load_avg;
|
||||
ALTER TABLE job ADD COLUMN net_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN net_data_vol_total;
|
||||
ALTER TABLE job ADD COLUMN file_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN file_data_vol_total;
|
||||
|
||||
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
|
||||
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
|
||||
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
|
||||
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
|
||||
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
|
||||
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
|
||||
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
|
||||
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
|
||||
|
||||
ALTER TABLE job DROP footprint;
|
||||
-- Do not use reserved keywords anymore
|
||||
RENAME TABLE hpc_user TO `user`;
|
||||
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
|
||||
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_subcluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_user;
|
||||
DROP INDEX IF EXISTS jobs_user_starttime;
|
||||
DROP INDEX IF EXISTS jobs_user_duration;
|
||||
DROP INDEX IF EXISTS jobs_user_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_project;
|
||||
DROP INDEX IF EXISTS jobs_project_user;
|
||||
DROP INDEX IF EXISTS jobs_project_starttime;
|
||||
DROP INDEX IF EXISTS jobs_project_duration;
|
||||
DROP INDEX IF EXISTS jobs_project_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_starttime;
|
||||
DROP INDEX IF EXISTS jobs_duration;
|
||||
DROP INDEX IF EXISTS jobs_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_duration_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numnodes_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numacc_starttime;
|
||||
DROP INDEX IF EXISTS jobs_energy_starttime;
|
||||
123
internal/repository/migrations/mysql/08_add-footprint.up.sql
Normal file
123
internal/repository/migrations/mysql/08_add-footprint.up.sql
Normal file
@@ -0,0 +1,123 @@
|
||||
DROP INDEX IF EXISTS job_stats ON job;
|
||||
DROP INDEX IF EXISTS job_by_user ON job;
|
||||
DROP INDEX IF EXISTS job_by_starttime ON job;
|
||||
DROP INDEX IF EXISTS job_by_job_id ON job;
|
||||
DROP INDEX IF EXISTS job_list ON job;
|
||||
DROP INDEX IF EXISTS job_list_user ON job;
|
||||
DROP INDEX IF EXISTS job_list_users ON job;
|
||||
DROP INDEX IF EXISTS job_list_users_start ON job;
|
||||
|
||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||
ALTER TABLE job ADD COLUMN energy_footprint JSON;
|
||||
|
||||
ALTER TABLE job ADD COLUMN footprint JSON;
|
||||
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
|
||||
|
||||
-- Do not use reserved keywords anymore
|
||||
RENAME TABLE `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
|
||||
|
||||
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
|
||||
|
||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||
|
||||
ALTER TABLE job DROP flops_any_avg;
|
||||
ALTER TABLE job DROP mem_bw_avg;
|
||||
ALTER TABLE job DROP mem_used_max;
|
||||
ALTER TABLE job DROP load_avg;
|
||||
ALTER TABLE job DROP net_bw_avg;
|
||||
ALTER TABLE job DROP net_data_vol_total;
|
||||
ALTER TABLE job DROP file_bw_avg;
|
||||
ALTER TABLE job DROP file_data_vol_total;
|
||||
|
||||
-- Indices for: Single filters, combined filters, sorting, sorting with filters
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Optimize DB index usage
|
||||
103
internal/repository/migrations/sqlite3/08_add-footprint.down.sql
Normal file
103
internal/repository/migrations/sqlite3/08_add-footprint.down.sql
Normal file
@@ -0,0 +1,103 @@
|
||||
ALTER TABLE job DROP energy;
|
||||
ALTER TABLE job DROP energy_footprint;
|
||||
ALTER TABLE job ADD COLUMN flops_any_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_used_max;
|
||||
ALTER TABLE job ADD COLUMN load_avg;
|
||||
ALTER TABLE job ADD COLUMN net_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN net_data_vol_total;
|
||||
ALTER TABLE job ADD COLUMN file_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN file_data_vol_total;
|
||||
|
||||
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
|
||||
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
|
||||
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
|
||||
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
|
||||
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
|
||||
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
|
||||
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
|
||||
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
|
||||
|
||||
ALTER TABLE job DROP footprint;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_subcluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_cluster_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_cluster_numacc;
|
||||
DROP INDEX IF EXISTS jobs_cluster_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_numacc;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numacc;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_numacc;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_user;
|
||||
DROP INDEX IF EXISTS jobs_user_starttime;
|
||||
DROP INDEX IF EXISTS jobs_user_duration;
|
||||
DROP INDEX IF EXISTS jobs_user_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_user_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_user_numacc;
|
||||
DROP INDEX IF EXISTS jobs_user_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_project;
|
||||
DROP INDEX IF EXISTS jobs_project_user;
|
||||
DROP INDEX IF EXISTS jobs_project_starttime;
|
||||
DROP INDEX IF EXISTS jobs_project_duration;
|
||||
DROP INDEX IF EXISTS jobs_project_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_project_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_project_numacc;
|
||||
DROP INDEX IF EXISTS jobs_project_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_numacc;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_starttime;
|
||||
DROP INDEX IF EXISTS jobs_duration;
|
||||
DROP INDEX IF EXISTS jobs_numnodes;
|
||||
DROP INDEX IF EXISTS jobs_numhwthreads;
|
||||
DROP INDEX IF EXISTS jobs_numacc;
|
||||
DROP INDEX IF EXISTS jobs_energy;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_duration_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numnodes_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numhwthreads_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numacc_starttime;
|
||||
DROP INDEX IF EXISTS jobs_energy_starttime;
|
||||
142
internal/repository/migrations/sqlite3/08_add-footprint.up.sql
Normal file
142
internal/repository/migrations/sqlite3/08_add-footprint.up.sql
Normal file
@@ -0,0 +1,142 @@
|
||||
DROP INDEX IF EXISTS job_stats;
|
||||
DROP INDEX IF EXISTS job_by_user;
|
||||
DROP INDEX IF EXISTS job_by_starttime;
|
||||
DROP INDEX IF EXISTS job_by_job_id;
|
||||
DROP INDEX IF EXISTS job_list;
|
||||
DROP INDEX IF EXISTS job_list_user;
|
||||
DROP INDEX IF EXISTS job_list_users;
|
||||
DROP INDEX IF EXISTS job_list_users_start;
|
||||
|
||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||
ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL;
|
||||
|
||||
ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL;
|
||||
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
|
||||
|
||||
-- Do not use reserved keywords anymore
|
||||
ALTER TABLE "user" RENAME TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN "user" TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN "partition" TO cluster_partition;
|
||||
|
||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||
|
||||
ALTER TABLE job DROP flops_any_avg;
|
||||
ALTER TABLE job DROP mem_bw_avg;
|
||||
ALTER TABLE job DROP mem_used_max;
|
||||
ALTER TABLE job DROP load_avg;
|
||||
ALTER TABLE job DROP net_bw_avg;
|
||||
ALTER TABLE job DROP net_data_vol_total;
|
||||
ALTER TABLE job DROP file_bw_avg;
|
||||
ALTER TABLE job DROP file_data_vol_total;
|
||||
|
||||
-- Indices for: Single filters, combined filters, sorting, sorting with filters
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Optimize DB index usage
|
||||
PRAGMA optimize;
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE IF EXISTS job_cache;
|
||||
119
internal/repository/migrations/sqlite3/09_add-job-cache.up.sql
Normal file
119
internal/repository/migrations/sqlite3/09_add-job-cache.up.sql
Normal file
@@ -0,0 +1,119 @@
|
||||
CREATE TABLE "job_cache" (
|
||||
id INTEGER PRIMARY KEY,
|
||||
job_id BIGINT NOT NULL,
|
||||
cluster VARCHAR(255) NOT NULL,
|
||||
subcluster VARCHAR(255) NOT NULL,
|
||||
submit_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
|
||||
start_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
|
||||
hpc_user VARCHAR(255) NOT NULL,
|
||||
project VARCHAR(255) NOT NULL,
|
||||
cluster_partition VARCHAR(255),
|
||||
array_job_id BIGINT,
|
||||
duration INT NOT NULL,
|
||||
walltime INT NOT NULL,
|
||||
job_state VARCHAR(255) NOT NULL
|
||||
CHECK (job_state IN (
|
||||
'boot_fail', 'cancelled', 'completed', 'deadline',
|
||||
'failed', 'node_fail', 'out-of-memory', 'pending',
|
||||
'preempted', 'running', 'suspended', 'timeout'
|
||||
)),
|
||||
meta_data TEXT, -- JSON
|
||||
resources TEXT NOT NULL, -- JSON
|
||||
num_nodes INT NOT NULL,
|
||||
num_hwthreads INT,
|
||||
num_acc INT,
|
||||
smt TINYINT NOT NULL DEFAULT 1 CHECK (smt IN (0, 1)),
|
||||
shared TEXT NOT NULL
|
||||
CHECK (shared IN ("none", "single_user", "multi_user")),
|
||||
monitoring_status TINYINT NOT NULL DEFAULT 1
|
||||
CHECK (monitoring_status IN (0, 1, 2, 3)),
|
||||
energy REAL NOT NULL DEFAULT 0.0,
|
||||
energy_footprint TEXT DEFAULT NULL,
|
||||
footprint TEXT DEFAULT NULL,
|
||||
UNIQUE (job_id, cluster, start_time)
|
||||
);
|
||||
|
||||
CREATE TABLE "job_new" (
|
||||
id INTEGER PRIMARY KEY,
|
||||
job_id BIGINT NOT NULL,
|
||||
cluster TEXT NOT NULL,
|
||||
subcluster TEXT NOT NULL,
|
||||
submit_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
|
||||
start_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
|
||||
hpc_user TEXT NOT NULL,
|
||||
project TEXT NOT NULL,
|
||||
cluster_partition TEXT,
|
||||
array_job_id BIGINT,
|
||||
duration INT NOT NULL,
|
||||
walltime INT NOT NULL,
|
||||
job_state TEXT NOT NULL
|
||||
CHECK (job_state IN (
|
||||
'boot_fail', 'cancelled', 'completed', 'deadline',
|
||||
'failed', 'node_fail', 'out-of-memory', 'pending',
|
||||
'preempted', 'running', 'suspended', 'timeout'
|
||||
)),
|
||||
meta_data TEXT, -- JSON
|
||||
resources TEXT NOT NULL, -- JSON
|
||||
num_nodes INT NOT NULL,
|
||||
num_hwthreads INT,
|
||||
num_acc INT,
|
||||
smt INT NOT NULL DEFAULT 1,
|
||||
shared TEXT NOT NULL
|
||||
CHECK (shared IN ("none", "single_user", "multi_user")),
|
||||
monitoring_status TINYINT NOT NULL DEFAULT 1
|
||||
CHECK (monitoring_status IN (0, 1, 2, 3)),
|
||||
energy REAL NOT NULL DEFAULT 0.0,
|
||||
energy_footprint TEXT DEFAULT NULL,
|
||||
footprint TEXT DEFAULT NULL,
|
||||
UNIQUE (job_id, cluster, start_time)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS lookup_exclusive (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL UNIQUE
|
||||
);
|
||||
|
||||
INSERT INTO lookup_exclusive (id, name) VALUES
|
||||
(0, 'multi_user'),
|
||||
(1, 'none'),
|
||||
(2, 'single_user');
|
||||
|
||||
INSERT INTO job_new (
|
||||
id, job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources,
|
||||
num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy,
|
||||
energy_footprint, footprint
|
||||
) SELECT
|
||||
id,
|
||||
job_id,
|
||||
cluster,
|
||||
subcluster,
|
||||
0,
|
||||
start_time,
|
||||
hpc_user,
|
||||
project,
|
||||
cluster_partition,
|
||||
array_job_id,
|
||||
duration,
|
||||
walltime,
|
||||
job_state,
|
||||
meta_data,
|
||||
resources,
|
||||
num_nodes,
|
||||
num_hwthreads,
|
||||
num_acc,
|
||||
smt,
|
||||
(
|
||||
SELECT name FROM lookup_exclusive
|
||||
WHERE id = job.exclusive
|
||||
),
|
||||
monitoring_status,
|
||||
energy,
|
||||
energy_footprint,
|
||||
footprint
|
||||
FROM job;
|
||||
|
||||
DROP TABLE lookup_exclusive;
|
||||
DROP TABLE job;
|
||||
ALTER TABLE job_new RENAME TO job;
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE IF EXISTS node;
|
||||
50
internal/repository/migrations/sqlite3/10_node-table.up.sql
Normal file
50
internal/repository/migrations/sqlite3/10_node-table.up.sql
Normal file
@@ -0,0 +1,50 @@
|
||||
CREATE TABLE "node" (
|
||||
id INTEGER PRIMARY KEY,
|
||||
hostname VARCHAR(255) NOT NULL,
|
||||
cluster VARCHAR(255) NOT NULL,
|
||||
subcluster VARCHAR(255) NOT NULL,
|
||||
meta_data TEXT, -- JSON
|
||||
UNIQUE (hostname, cluster)
|
||||
);
|
||||
|
||||
CREATE TABLE "node_state" (
|
||||
id INTEGER PRIMARY KEY,
|
||||
time_stamp INTEGER NOT NULL,
|
||||
jobs_running INTEGER DEFAULT 0 NOT NULL,
|
||||
cpus_allocated INTEGER DEFAULT 0 NOT NULL,
|
||||
memory_allocated INTEGER DEFAULT 0 NOT NULL,
|
||||
gpus_allocated INTEGER DEFAULT 0 NOT NULL,
|
||||
node_state VARCHAR(255) NOT NULL
|
||||
CHECK (node_state IN (
|
||||
'allocated', 'reserved', 'idle', 'mixed',
|
||||
'down', 'unknown'
|
||||
)),
|
||||
health_state VARCHAR(255) NOT NULL
|
||||
CHECK (health_state IN (
|
||||
'full', 'partial', 'failed'
|
||||
)),
|
||||
node_id INTEGER,
|
||||
FOREIGN KEY (node_id) REFERENCES node (id)
|
||||
);
|
||||
|
||||
-- Add NEW Indices For New Job Table Columns
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_submittime ON job (cluster, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_submittime ON job (cluster, cluster_partition, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_submittime ON job (
|
||||
cluster, cluster_partition, job_state, submit_time
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_submittime ON job (cluster, job_state, submit_time);
|
||||
|
||||
-- Add NEW Indices For New Node Table VARCHAR Fields
|
||||
CREATE INDEX IF NOT EXISTS nodes_cluster ON node (cluster);
|
||||
CREATE INDEX IF NOT EXISTS nodes_cluster_subcluster ON node (cluster, subcluster);
|
||||
|
||||
-- Add NEW Indices For New Node_State Table Fields
|
||||
CREATE INDEX IF NOT EXISTS nodestates_state ON node_state (node_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_health ON node_state (health_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state);
|
||||
|
||||
-- Add NEW Indices For Increased Amounts of Tags
|
||||
CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id);
|
||||
CREATE INDEX IF NOT EXISTS tags_tagid ON jobtag (tag_id);
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user