mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-23 08:07:29 +01:00
Compare commits
397 Commits
v1.3.1
...
ccfront-de
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
24d43f3540 | ||
|
|
e376f97547 | ||
|
|
f2428d3cb3 | ||
|
|
2fdac85d31 | ||
|
|
b731395689 | ||
|
|
07405e3466 | ||
|
|
c0443cbec2 | ||
|
|
633bd42036 | ||
|
|
998ef8d834 | ||
|
|
c25b076ca9 | ||
|
|
f43379f365 | ||
|
|
58e678d72c | ||
|
|
1b043838ea | ||
|
|
f7a67c72bf | ||
|
|
c5476d08fa | ||
|
|
8af92b1557 | ||
|
|
eaa826bb8a | ||
|
|
140b3c371d | ||
|
|
2bd7c8d51e | ||
|
|
1e63cdbcda | ||
|
|
dd470d49ec | ||
|
|
95d8062b00 | ||
|
5266644725
|
|||
|
81d9e96552
|
|||
|
|
4ec9f06114 | ||
|
0033e9f6c0
|
|||
|
571652c314
|
|||
|
|
7ec233e18a | ||
|
|
13c9a12336 | ||
|
|
83d472ecd6 | ||
|
|
c21da6512a | ||
|
|
4b4374e0df | ||
|
|
407276a04d | ||
|
|
64f60905b4 | ||
|
|
9e6072fed2 | ||
|
|
a3e5c424fd | ||
|
|
6683a350aa | ||
|
|
05bfa9b546 | ||
|
|
735988decb | ||
|
|
d0580592be | ||
|
|
817076bdbf | ||
|
|
736236e9ca | ||
|
|
3f4114c51b | ||
|
|
5c2c493c56 | ||
|
|
2c383ebea1 | ||
|
|
91e73450cf | ||
|
|
e55798944e | ||
|
|
5ea11a5ad2 | ||
|
|
2a3383e9e6 | ||
|
|
e871703724 | ||
|
|
1ee367d7be | ||
|
|
bce536b9b4 | ||
|
|
7c9182e0b0 | ||
|
|
aa915d639d | ||
|
|
9489ebc7d6 | ||
|
2a5c525193
|
|||
|
9e2d981c60
|
|||
|
|
53dfe9e4f5 | ||
|
48e95fbdb0
|
|||
|
fd94d85edf
|
|||
|
f2d1a85afb
|
|||
|
0bdbcb8bab
|
|||
|
|
7b91a819be | ||
| bc89025924 | |||
|
|
16bcaef4c3 | ||
|
|
fcbfa451f2 | ||
|
|
559ce53ca4 | ||
|
|
ee2c5b58d7 | ||
|
|
d98d998106 | ||
|
212c45e070
|
|||
|
143fa9b6ed
|
|||
|
4849928288
|
|||
|
|
9248ee8868 | ||
|
|
1616d96732 | ||
| 0bbedd1600 | |||
|
|
c7e49644d8 | ||
|
010c903c74
|
|||
|
e4d12e3537
|
|||
|
051cc8384e
|
|||
|
49a94170d2
|
|||
|
|
42e8e37bd4 | ||
|
|
5d2c350ce2 | ||
|
|
85dc0362c1 | ||
|
|
01c06728eb | ||
|
|
257250714d | ||
|
|
3b769c3059 | ||
|
|
a7395ed45b | ||
|
|
ab07c7928f | ||
|
|
b0c0d15505 | ||
|
|
fcf50790da | ||
|
|
1e43654607 | ||
|
|
4fecbe820d | ||
|
|
763c9dfa6b | ||
|
9de5879786
|
|||
|
|
9396e7492c | ||
|
3ac3415178
|
|||
|
1aae1c59d0
|
|||
|
907e80a01c
|
|||
|
|
8a10b69716 | ||
|
|
1a3cf7edd6 | ||
|
|
76d0fc979b | ||
|
|
a42d8ece35 | ||
|
|
93377f53fc | ||
|
|
c853d74ba0 | ||
|
|
0b9f74f4f4 | ||
|
|
5da6baf828 | ||
|
5766945006
|
|||
|
a53d473b58
|
|||
|
|
d1207ad80e | ||
|
|
e2efe71b33 | ||
|
|
2aef6ed9c0 | ||
|
|
fcb6db0603 | ||
| 01b1136316 | |||
|
|
2512fe9e75 | ||
|
|
f89b5cd2ec | ||
|
|
ab284ed208 | ||
|
|
00a578657c | ||
|
|
38ce40ae7d | ||
| e1be6c7138 | |||
| 28539e60b0 | |||
|
adb11b3ed0
|
|||
|
|
f1e6dedd44 | ||
|
|
8ea1454c06 | ||
| 81b8d578f2 | |||
|
|
16b11db39c | ||
| 0d923cc920 | |||
| c523e93564 | |||
| d588798ea1 | |||
| a11f165f2a | |||
|
|
d4f487d554 | ||
|
|
93d5a0e532 | ||
|
|
00ddc462d2 | ||
|
|
5f4a74f8ba | ||
|
|
a8eff6fbd1 | ||
|
|
baa7367ebe | ||
|
|
69f8a34aac | ||
|
|
21b3a67988 | ||
|
|
d89574ce73 | ||
| ddeac6b9d9 | |||
| 17906ec0eb | |||
|
|
311c088d3d | ||
| a2584d6083 | |||
| 35bd7739c6 | |||
| 7f43c88a39 | |||
|
|
fc1c54a141 | ||
|
|
2af111c584 | ||
| c093cca8b1 | |||
|
|
2bb1b78ba4 | ||
| 3ab26172c4 | |||
| cdd45ce88b | |||
|
210a7d3136
|
|||
|
92ec64d80f
|
|||
|
ff37f71fdb
|
|||
|
6056341525
|
|||
|
|
075612f5bd | ||
| 1a87ed8210 | |||
|
|
c05ffeb16d | ||
| ee3710c5ed | |||
| 4327c4b1f7 | |||
| 492e56a098 | |||
| f0257a2784 | |||
| ec1ead89ab | |||
|
|
ae53e87aba | ||
|
|
939dd2320a | ||
|
|
2c8b73e2e2 | ||
|
|
eabc6212ea | ||
|
|
c120d6517f | ||
|
|
597ee1dad7 | ||
|
|
c4a901504d | ||
|
|
f5cc5d07fd | ||
|
|
8a0e6c921c | ||
|
|
bf1bff9ace | ||
|
|
06f24e988f | ||
|
|
ae327f545e | ||
|
|
35012b18c5 | ||
|
|
9688bad622 | ||
|
|
447b8d3372 | ||
|
|
01102cb9b0 | ||
|
|
934d1a6114 | ||
|
|
6f74c8cb77 | ||
|
|
63b9e619a4 | ||
|
|
82e28f26d7 | ||
|
|
ca9fd96baa | ||
|
|
39b22267d6 | ||
|
|
60d7984d66 | ||
|
|
33d219d2ac | ||
|
|
85a77e05af | ||
|
|
3dfeabcec6 | ||
|
|
673fdc443c | ||
|
|
2f6e5a7648 | ||
|
|
2cbe8e9517 | ||
|
|
2f0460d6ec | ||
|
|
37f4ed7770 | ||
|
|
e3104c61cb | ||
|
|
bc434ee8cb | ||
|
|
f4102b948e | ||
|
|
ed991de11a | ||
|
|
322e161064 | ||
|
|
1adc741cc2 | ||
|
|
4eff87bbf7 | ||
|
|
fc6970d08a | ||
|
|
f616c7e1c6 | ||
|
|
89ec749172 | ||
|
|
182f0f2c64 | ||
|
|
e3681495ce | ||
|
|
37415fa261 | ||
|
|
7243dbe763 | ||
|
|
0ff5c4bedd | ||
|
|
f047f89ad5 | ||
|
|
0eb0aa1d3b | ||
|
|
6019891591 | ||
|
|
615281601c | ||
|
|
82baf5d384 | ||
|
|
6fe93ecb7e | ||
|
|
b3222f3523 | ||
|
|
3b94863521 | ||
|
|
582dc8bf46 | ||
|
|
a9868fd275 | ||
|
|
218e56576a | ||
|
|
c50e79375a | ||
|
|
dcb8308f35 | ||
|
|
183b310696 | ||
|
|
c7d0c86d52 | ||
|
|
48225662b1 | ||
|
|
f53fc088ec | ||
|
|
05517fcbcd | ||
|
|
18af51b0a4 | ||
|
|
ede3da7a87 | ||
|
|
8e3327ef6a | ||
|
|
827f6daabc | ||
|
|
2567442321 | ||
|
|
9cf5478519 | ||
|
|
e5275311c2 | ||
|
|
21e4870e4c | ||
|
|
beba7c8d2e | ||
|
|
fe35313305 | ||
|
|
d7a8bbf40b | ||
|
|
f1893c596e | ||
|
|
6367c1ab4d | ||
|
|
9579887fc4 | ||
|
|
e29be2f140 | ||
|
|
2736b5d1ef | ||
|
|
ff52fb16b6 | ||
|
|
ccbf3867e1 | ||
|
|
f0de422c6e | ||
|
|
64cc19b252 | ||
|
|
26226009f0 | ||
|
|
d10e09da02 | ||
|
|
00a2e58fee | ||
|
|
b1cb45dfe6 | ||
|
|
a2951d1f05 | ||
|
|
c0b1e97602 | ||
|
|
71621a9dc4 | ||
|
|
b3ed2afebe | ||
|
|
704620baff | ||
|
|
8feb805167 | ||
|
|
065b32755a | ||
|
|
1b5f4bff2c | ||
|
|
8e1c5a485f | ||
| 5fa6c9db35 | |||
| 5482b9be2c | |||
|
|
7400273b0a | ||
|
|
0b7cdde4a0 | ||
|
|
d5382aec4f | ||
|
|
df484dc816 | ||
|
|
7ea4086807 | ||
|
|
b04bf6a951 | ||
| 7c33dcf630 | |||
| 5e65e21f0b | |||
| 53ca38ce53 | |||
|
|
398e3c1b91 | ||
| 508978d586 | |||
| e267481f71 | |||
|
|
193bee5ac8 | ||
| f58efa2871 | |||
| 6568b6d723 | |||
|
|
4b1b34d8a7 | ||
| 39c09f8565 | |||
|
|
275a77807e | ||
|
|
6443541a79 | ||
|
|
5eb6f7d307 | ||
|
|
bce2a66177 | ||
|
|
7602641909 | ||
|
|
54f3a261c5 | ||
|
|
906bac965f | ||
|
|
4ec1de6900 | ||
|
|
8ded131666 | ||
| 47b14f932e | |||
|
|
838ebb3f69 | ||
| c459724114 | |||
| b0c9d1164d | |||
| 7c51d88501 | |||
| 5b03cf826b | |||
| f305863616 | |||
| db5809d522 | |||
|
|
83df6f015c | ||
| e7231b0e13 | |||
|
|
cff60eb51c | ||
|
f914a312f5
|
|||
| 56ebb301ca | |||
|
|
a59df12595 | ||
|
|
5cc7fc6ccb | ||
|
|
55027cb630 | ||
|
|
036eba68e1 | ||
|
|
d34e0d9348 | ||
|
|
31765ce0ef | ||
|
|
9fe7cdca92 | ||
|
|
adc3502b6b | ||
|
|
95fe369648 | ||
|
|
01845a0cb7 | ||
|
|
708eaf4178 | ||
|
|
d629a58712 | ||
|
|
90886b63d6 | ||
|
|
084f89fa32 | ||
|
|
ceb3a095d8 | ||
|
|
1758275f11 | ||
|
|
e74e506ffe | ||
|
|
599a36466a | ||
|
|
613e128cab | ||
|
|
e4f8022b7a | ||
|
|
5603c41900 | ||
| a8a27c9b51 | |||
|
|
b70de5a4be | ||
|
|
b1fd07cd30 | ||
|
|
6ab2e02fe6 | ||
|
|
5535c5780c | ||
|
|
49e0a2c055 | ||
|
5e074dad10
|
|||
|
d6a88896d0
|
|||
|
5c99f5f8bb
|
|||
|
e1faba0ff2
|
|||
|
ba2f406bc0
|
|||
|
9b6db4684a
|
|||
|
|
561fd41d5d | ||
|
|
ce9995dac7 | ||
|
|
0afaea9513 | ||
|
|
9b5c6e3164 | ||
|
|
e6ebec8c1e | ||
|
|
2551921ed6 | ||
|
|
e02575aad7 | ||
|
|
ff3502c87a | ||
|
|
017f9b2140 | ||
|
|
c80d3a6958 | ||
|
|
3ca1127685 | ||
|
|
18369da5bc | ||
|
|
e65100cdc8 | ||
|
|
6a1cb51c2f | ||
|
c4d93e492b
|
|||
|
c2f72f72ac
|
|||
|
721b6b2afa
|
|||
|
b6f011c669
|
|||
|
801607fc16
|
|||
|
01a4d33514
|
|||
|
e348ec74fd
|
|||
|
0458675608
|
|||
|
c61ffce0e9
|
|||
|
68a97dc980
|
|||
|
a07d167390
|
|||
|
|
a8721dcc69 | ||
|
|
68cf952ac6 | ||
|
|
e14d6a81fe | ||
|
|
a4912893a8 | ||
|
0adfb631ef
|
|||
|
b64ce1f67f
|
|||
|
e8e3b1595d
|
|||
|
f1427d5272
|
|||
|
|
bf6b87d65c | ||
|
|
0240997257 | ||
|
|
f1e341f0b9 | ||
|
a54acb8c42
|
|||
|
c6ede67589
|
|||
|
|
11176da5d8 | ||
|
|
0a604336c4 | ||
|
|
be9df7649f | ||
|
|
63fb923995 | ||
|
|
3afe40083d | ||
|
|
9d4767539c | ||
|
ac9bba8b5b
|
|||
|
80c46bea7f
|
|||
|
|
614f694777 | ||
|
|
1072d7b449 | ||
|
1b70596735
|
|||
|
|
61eebc9fbd | ||
|
b05909969f
|
|||
|
bd89ce7cc9
|
|||
|
130613b717
|
|||
|
b3c1f39a0e
|
|||
|
97c807cd33
|
|||
|
aede5f71ec
|
|||
|
786770f56a
|
|||
|
|
74d4f00784 | ||
|
d61c4235dc
|
|||
|
e8794b8c79
|
|||
|
552da005dc
|
|||
|
|
51452d2e68 | ||
|
5c5484b4d2
|
|||
|
|
684cb5a376 |
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.x
|
||||
go-version: 1.22.x
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build, Vet & Test
|
||||
|
||||
20
.gitignore
vendored
20
.gitignore
vendored
@@ -1,19 +1,23 @@
|
||||
/cc-backend
|
||||
|
||||
/var/job-archive
|
||||
/var/*.db
|
||||
/var/machine-state
|
||||
|
||||
/.env
|
||||
/config.json
|
||||
|
||||
/var/job-archive
|
||||
/var/machine-state
|
||||
/var/job.db-shm
|
||||
/var/job.db-wal
|
||||
/var/*.db
|
||||
/var/*.txt
|
||||
|
||||
/web/frontend/public/build
|
||||
/web/frontend/node_modules
|
||||
/.vscode/*
|
||||
|
||||
/archive-migration
|
||||
/archive-manager
|
||||
var/job.db-shm
|
||||
var/job.db-wal
|
||||
|
||||
/internal/repository/testdata/job.db-shm
|
||||
/internal/repository/testdata/job.db-wal
|
||||
|
||||
/.vscode/*
|
||||
dist/
|
||||
*.db
|
||||
|
||||
@@ -34,19 +34,6 @@ builds:
|
||||
main: ./tools/archive-manager
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
goarch:
|
||||
- amd64
|
||||
goamd64:
|
||||
- v3
|
||||
id: "archive-migration"
|
||||
binary: archive-migration
|
||||
main: ./tools/archive-migration
|
||||
tags:
|
||||
- static_build
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
@@ -70,7 +57,7 @@ archives:
|
||||
{{- else }}{{ .Arch }}{{ end }}
|
||||
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
||||
checksum:
|
||||
name_template: 'checksums.txt'
|
||||
name_template: "checksums.txt"
|
||||
snapshot:
|
||||
name_template: "{{ incpatch .Version }}-next"
|
||||
changelog:
|
||||
@@ -100,7 +87,7 @@ changelog:
|
||||
release:
|
||||
draft: false
|
||||
footer: |
|
||||
Supports job archive version 1 and database version 6.
|
||||
Supports job archive version 2 and database version 8.
|
||||
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
||||
|
||||
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
||||
|
||||
35
Makefile
35
Makefile
@@ -2,7 +2,7 @@ TARGET = ./cc-backend
|
||||
VAR = ./var
|
||||
CFG = config.json .env
|
||||
FRONTEND = ./web/frontend
|
||||
VERSION = 1.3.1
|
||||
VERSION = 1.4.2
|
||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||
@@ -22,13 +22,23 @@ SVELTE_COMPONENTS = status \
|
||||
header
|
||||
|
||||
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
|
||||
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.js) \
|
||||
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/header/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/job/*.svelte)
|
||||
|
||||
.PHONY: clean distclean test tags frontend $(TARGET)
|
||||
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
@@ -40,6 +50,15 @@ frontend:
|
||||
$(info ===> BUILD frontend)
|
||||
cd web/frontend && npm install && npm run build
|
||||
|
||||
swagger:
|
||||
$(info ===> GENERATE swagger)
|
||||
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./pkg/schema -g rest.go -o ./api
|
||||
@mv ./api/docs.go ./internal/api/docs.go
|
||||
|
||||
graphql:
|
||||
$(info ===> GENERATE graphql)
|
||||
@go run github.com/99designs/gqlgen
|
||||
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@go clean
|
||||
@@ -63,7 +82,7 @@ tags:
|
||||
@ctags -R
|
||||
|
||||
$(VAR):
|
||||
@mkdir $(VAR)
|
||||
@mkdir -p $(VAR)
|
||||
|
||||
config.json:
|
||||
$(info ===> Initialize config.json file)
|
||||
|
||||
@@ -65,7 +65,7 @@ cd ./cc-backend
|
||||
./startDemo.sh
|
||||
```
|
||||
|
||||
You can also try the demo using the lates release binary.
|
||||
You can also try the demo using the latest release binary.
|
||||
Create a folder and put the release binary `cc-backend` into this folder.
|
||||
Execute the following steps:
|
||||
|
||||
@@ -88,7 +88,9 @@ Analysis, Systems and Status views).
|
||||
There is a Makefile to automate the build of cc-backend. The Makefile supports
|
||||
the following targets:
|
||||
|
||||
* `make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequesite handling. Any change of frontend source files will result in a complete rebuild.
|
||||
* `make`: Initialize `var` directory and build svelte frontend and backend
|
||||
binary. Note that there is no proper prerequisite handling. Any change of
|
||||
frontend source files will result in a complete rebuild.
|
||||
* `make clean`: Clean go build cache and remove binary.
|
||||
* `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||
|
||||
@@ -147,8 +149,6 @@ contains Go packages that can be used by other projects.
|
||||
Additional command line helper tools.
|
||||
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
|
||||
Commands for getting information about an existing job archive.
|
||||
* [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
|
||||
Tool to migrate from previous to current job archive version.
|
||||
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
|
||||
Tool to convert external pubkey for use in `cc-backend`.
|
||||
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)
|
||||
|
||||
@@ -1,11 +1,46 @@
|
||||
# `cc-backend` version 1.3.1
|
||||
# `cc-backend` version 1.4.2
|
||||
|
||||
Supports job archive version 1 and database version 7.
|
||||
Supports job archive version 2 and database version 8.
|
||||
|
||||
This is a bugfix release of `cc-backend`, the API backend and frontend
|
||||
This is a small bug fix release of `cc-backend`, the API backend and frontend
|
||||
implementation of ClusterCockpit.
|
||||
For release-specific notes visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
|
||||
|
||||
## Breaking changes
|
||||
|
||||
None
|
||||
- You need to perform a database migration. Depending on your database size the
|
||||
migration might require several hours!
|
||||
- You need to adapt the `cluster.json` configuration files in the job-archive,
|
||||
add new required attributes to the metric list and after that edit
|
||||
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
|
||||
attribute set can be filtered and show up in the footprint UI and polar plot.
|
||||
- Continuous scrolling is default now in all job lists. You can change this back
|
||||
to paging globally, also every user can configure to use paging or continuous
|
||||
scrolling individually.
|
||||
- Tags have a scope now. Existing tags will get global scope in the database
|
||||
migration.
|
||||
|
||||
## New features
|
||||
|
||||
- Tags have a scope now. Tags created by a basic user are only visible by that
|
||||
user. Tags created by an admin/support role can be configured to be visible by
|
||||
all users (global scope) or only by the admin/support roles.
|
||||
- Re-sampling support for running (requires a recent `cc-metric-store`) and
|
||||
archived jobs. This greatly speeds up loading of large or very long jobs. You
|
||||
need to add the new configuration key `enable-resampling` to the `config.json`
|
||||
file.
|
||||
- For finished jobs a total job energy is shown in the job view.
|
||||
- Continuous scrolling in job lists is default now.
|
||||
- All database queries (especially for sqlite) were optimized resulting in
|
||||
dramatically faster load times.
|
||||
- A performance and energy footprint can be freely configured on a per
|
||||
subcluster basis. One can filter for footprint statistics for running and
|
||||
finished jobs.
|
||||
|
||||
## Known issues
|
||||
|
||||
- Currently energy footprint metrics of type energy are ignored for calculating
|
||||
total energy.
|
||||
- Resampling for running jobs only works with `cc-metric-store`.
|
||||
- With energy footprint metrics of type power the unit is ignored and it is
|
||||
assumed the metric has the unit Watt.
|
||||
|
||||
@@ -18,6 +18,7 @@ type Job {
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
energy: Float!
|
||||
SMT: Int!
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
@@ -27,12 +28,8 @@ type Job {
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
|
||||
memUsedMax: Float
|
||||
flopsAnyAvg: Float
|
||||
memBwAvg: Float
|
||||
loadAvg: Float
|
||||
|
||||
footprint: [FootprintValue]
|
||||
energyFootprint: [EnergyFootprintValue]
|
||||
metaData: Any
|
||||
userData: User
|
||||
}
|
||||
@@ -45,7 +42,6 @@ type JobLink {
|
||||
type Cluster {
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
metricConfig: [MetricConfig!]!
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
}
|
||||
|
||||
@@ -61,9 +57,24 @@ type SubCluster {
|
||||
flopRateSimd: MetricValue!
|
||||
memoryBandwidth: MetricValue!
|
||||
topology: Topology!
|
||||
metricConfig: [MetricConfig!]!
|
||||
footprint: [String!]!
|
||||
}
|
||||
|
||||
type FootprintValue {
|
||||
name: String!
|
||||
stat: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type EnergyFootprintValue {
|
||||
hardware: String!
|
||||
metric: String!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type MetricValue {
|
||||
name: String
|
||||
unit: Unit!
|
||||
value: Float!
|
||||
}
|
||||
@@ -102,6 +113,7 @@ type MetricConfig {
|
||||
normal: Float
|
||||
caution: Float!
|
||||
alert: Float!
|
||||
lowerIsBetter: Boolean
|
||||
subClusters: [SubClusterConfig!]!
|
||||
}
|
||||
|
||||
@@ -109,6 +121,7 @@ type Tag {
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
scope: String!
|
||||
}
|
||||
|
||||
type Resource {
|
||||
@@ -150,9 +163,10 @@ type MetricStatistics {
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
mean: [NullableFloat!]!
|
||||
median: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
@@ -180,6 +194,28 @@ type NodeMetrics {
|
||||
metrics: [JobMetricWithName!]!
|
||||
}
|
||||
|
||||
type NodesResultList {
|
||||
items: [NodeMetrics!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
totalNodes: Int
|
||||
hasNextPage: Boolean
|
||||
}
|
||||
|
||||
type ClusterSupport {
|
||||
cluster: String!
|
||||
subClusters: [String!]!
|
||||
}
|
||||
|
||||
type GlobalMetricListItem {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
footprint: String
|
||||
availability: [ClusterSupport!]!
|
||||
}
|
||||
|
||||
type Count {
|
||||
name: String!
|
||||
count: Int!
|
||||
@@ -191,27 +227,34 @@ type User {
|
||||
email: String!
|
||||
}
|
||||
|
||||
input MetricStatItem {
|
||||
metricName: String!
|
||||
range: FloatRange!
|
||||
}
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
globalMetrics: [GlobalMetricListItem!]!
|
||||
|
||||
user(username: String!): User
|
||||
allocatedNodes(cluster: String!): [Count!]!
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]!
|
||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||
|
||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
||||
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
|
||||
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]!
|
||||
|
||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
||||
|
||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
||||
nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList!
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
createTag(type: String!, name: String!, scope: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
@@ -220,7 +263,7 @@ type Mutation {
|
||||
}
|
||||
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
type TimeRangeOutput { range: String, from: Time!, to: Time! }
|
||||
|
||||
input JobFilter {
|
||||
tags: [ID!]
|
||||
@@ -232,6 +275,7 @@ input JobFilter {
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
energy: FloatRange
|
||||
|
||||
minRunningFor: Int
|
||||
|
||||
@@ -241,17 +285,14 @@ input JobFilter {
|
||||
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
memUsedMax: FloatRange
|
||||
|
||||
metricStats: [MetricStatItem!]
|
||||
exclusive: Int
|
||||
node: StringInput
|
||||
}
|
||||
|
||||
input OrderByInput {
|
||||
field: String!
|
||||
type: String!,
|
||||
order: SortDirectionEnum! = ASC
|
||||
}
|
||||
|
||||
@@ -269,9 +310,13 @@ input StringInput {
|
||||
in: [String!]
|
||||
}
|
||||
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input TimeRange { range: String, from: Time, to: Time }
|
||||
|
||||
input FloatRange {
|
||||
from: Float!
|
||||
to: Float!
|
||||
}
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
@@ -295,6 +340,7 @@ type HistoPoint {
|
||||
type MetricHistoPoints {
|
||||
metric: String!
|
||||
unit: String!
|
||||
stat: String
|
||||
data: [MetricHistoPoint!]
|
||||
}
|
||||
|
||||
|
||||
335
api/swagger.json
335
api/swagger.json
@@ -202,7 +202,7 @@
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -272,7 +272,7 @@
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -342,7 +342,7 @@
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -487,7 +487,7 @@
|
||||
"201": {
|
||||
"description": "Job added successfully",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.StartJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -581,89 +581,7 @@
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: finding job failed: sql: no rows in result set",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/jobs/stop_job/{id}": {
|
||||
"post": {
|
||||
"security": [
|
||||
{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Job to stop is specified by database ID. Only stopTime and final state are required in request body.\nReturns full job resource information according to 'JobMeta' scheme.",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
"tags": [
|
||||
"Job add and modify"
|
||||
],
|
||||
"summary": "Marks job as completed and triggers archiving",
|
||||
"parameters": [
|
||||
{
|
||||
"type": "integer",
|
||||
"description": "Database ID of Job",
|
||||
"name": "id",
|
||||
"in": "path",
|
||||
"required": true
|
||||
},
|
||||
{
|
||||
"description": "stopTime and final state in request body",
|
||||
"name": "request",
|
||||
"in": "body",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.StopJobApiRequest"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Job resource",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/schema.JobMeta"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "Bad Request",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"401": {
|
||||
"description": "Unauthorized",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"404": {
|
||||
"description": "Resource not found",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: finding job failed: sql: no rows in result set",
|
||||
"description": "Unprocessable Entity: job has already been stopped",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
@@ -684,7 +602,7 @@
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nIf tagged job is already finished: Tag will be written directly to respective archive files.",
|
||||
"description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
@@ -909,6 +827,72 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"/notice/": {
|
||||
"post": {
|
||||
"security": [
|
||||
{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Modifies the content of notice.txt, shown as notice box on the homepage.\nIf more than one formValue is set then only the highest priority field is used.\nOnly accessible from IPs registered with apiAllowedIPs configuration option.",
|
||||
"consumes": [
|
||||
"multipart/form-data"
|
||||
],
|
||||
"produces": [
|
||||
"text/plain"
|
||||
],
|
||||
"tags": [
|
||||
"User"
|
||||
],
|
||||
"summary": "Updates or empties the notice box content",
|
||||
"parameters": [
|
||||
{
|
||||
"type": "string",
|
||||
"description": "Priority 1: New content to display",
|
||||
"name": "new-content",
|
||||
"in": "formData"
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Success Response Message",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "Bad Request",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"401": {
|
||||
"description": "Unauthorized",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: The user could not be updated",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/user/{id}": {
|
||||
"post": {
|
||||
"security": [
|
||||
@@ -1277,6 +1261,11 @@
|
||||
"type": "string",
|
||||
"example": "Testjob"
|
||||
},
|
||||
"scope": {
|
||||
"description": "Tag Scope for Frontend Display",
|
||||
"type": "string",
|
||||
"example": "global"
|
||||
},
|
||||
"type": {
|
||||
"description": "Tag Type",
|
||||
"type": "string",
|
||||
@@ -1284,6 +1273,14 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DefaultJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"msg": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DeleteJobApiRequest": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
@@ -1307,14 +1304,6 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DeleteJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"msg": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.EditMetaRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -1401,15 +1390,6 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.StartJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"description": "Database ID of new job",
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.StopJobApiRequest": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
@@ -1418,17 +1398,14 @@
|
||||
],
|
||||
"properties": {
|
||||
"cluster": {
|
||||
"description": "Cluster of job",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "Cluster Job ID of job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final job state",
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/definitions/schema.JobState"
|
||||
@@ -1437,12 +1414,10 @@
|
||||
"example": "completed"
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start Time of job as epoch",
|
||||
"type": "integer",
|
||||
"example": 1649723812
|
||||
},
|
||||
"stopTime": {
|
||||
"description": "Stop Time of job as epoch",
|
||||
"type": "integer",
|
||||
"example": 1649763839
|
||||
}
|
||||
@@ -1487,12 +1462,10 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
@@ -1500,33 +1473,39 @@
|
||||
"$ref": "#/definitions/schema.JobLinkResultList"
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 43200
|
||||
},
|
||||
"energy": {
|
||||
"type": "number"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"maximum": 2,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"flopsAnyAvg": {
|
||||
"description": "FlopsAnyAvg as Float64",
|
||||
"type": "number"
|
||||
"footprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"description": "The unique identifier of a job in the database",
|
||||
"type": "integer"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
@@ -1542,95 +1521,69 @@
|
||||
],
|
||||
"example": "completed"
|
||||
},
|
||||
"loadAvg": {
|
||||
"description": "LoadAvg as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"memBwAvg": {
|
||||
"description": "MemBwAvg as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"memUsedMax": {
|
||||
"description": "MemUsedMax as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful",
|
||||
"type": "integer",
|
||||
"maximum": 3,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "Number of HWThreads used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 20
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string",
|
||||
"example": "abcd200"
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Resource"
|
||||
}
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer",
|
||||
"example": 4
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start time as 'time.Time' data type",
|
||||
"type": "string"
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Tag"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"type": "string",
|
||||
"example": "abcd100h"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 86400
|
||||
@@ -1667,12 +1620,10 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
@@ -1680,29 +1631,39 @@
|
||||
"$ref": "#/definitions/schema.JobLinkResultList"
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 43200
|
||||
},
|
||||
"energy": {
|
||||
"type": "number"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"maximum": 2,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"footprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"description": "The unique identifier of a job in the database",
|
||||
"type": "integer"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
@@ -1719,91 +1680,76 @@
|
||||
"example": "completed"
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful",
|
||||
"type": "integer",
|
||||
"maximum": 3,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "Number of HWThreads used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 20
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string",
|
||||
"example": "abcd200"
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Resource"
|
||||
}
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer",
|
||||
"example": 4
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start epoch time stamp in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 1649723812
|
||||
},
|
||||
"statistics": {
|
||||
"description": "Metric statistics of job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/definitions/schema.JobStatistics"
|
||||
}
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Tag"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"type": "string",
|
||||
"example": "abcd100h"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 86400
|
||||
@@ -1892,6 +1838,15 @@
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"energy": {
|
||||
"type": "string"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "string"
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -1969,22 +1924,18 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"accelerators": {
|
||||
"description": "List of accelerator device ids",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"configuration": {
|
||||
"description": "The configuration options of the node",
|
||||
"type": "string"
|
||||
},
|
||||
"hostname": {
|
||||
"description": "Name of the host (= node)",
|
||||
"type": "string"
|
||||
},
|
||||
"hwthreads": {
|
||||
"description": "List of OS processor ids",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
@@ -2027,6 +1978,12 @@
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"median": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"min": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
@@ -2050,15 +2007,33 @@
|
||||
"coresPerSocket": {
|
||||
"type": "integer"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"flopRateScalar": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"metricConfig": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.MetricConfig"
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -2088,6 +2063,15 @@
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"energy": {
|
||||
"type": "string"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "string"
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -2107,16 +2091,17 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"description": "The unique DB identifier of a tag",
|
||||
"type": "integer"
|
||||
},
|
||||
"name": {
|
||||
"description": "Tag Name",
|
||||
"type": "string",
|
||||
"example": "Testjob"
|
||||
},
|
||||
"scope": {
|
||||
"type": "string",
|
||||
"example": "global"
|
||||
},
|
||||
"type": {
|
||||
"description": "Tag Type",
|
||||
"type": "string",
|
||||
"example": "Debug"
|
||||
}
|
||||
|
||||
261
api/swagger.yaml
261
api/swagger.yaml
@@ -23,11 +23,20 @@ definitions:
|
||||
description: Tag Name
|
||||
example: Testjob
|
||||
type: string
|
||||
scope:
|
||||
description: Tag Scope for Frontend Display
|
||||
example: global
|
||||
type: string
|
||||
type:
|
||||
description: Tag Type
|
||||
example: Debug
|
||||
type: string
|
||||
type: object
|
||||
api.DefaultJobApiResponse:
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
type: object
|
||||
api.DeleteJobApiRequest:
|
||||
properties:
|
||||
cluster:
|
||||
@@ -45,11 +54,6 @@ definitions:
|
||||
required:
|
||||
- jobId
|
||||
type: object
|
||||
api.DeleteJobApiResponse:
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
type: object
|
||||
api.EditMetaRequest:
|
||||
properties:
|
||||
key:
|
||||
@@ -108,33 +112,22 @@ definitions:
|
||||
scope:
|
||||
$ref: '#/definitions/schema.MetricScope'
|
||||
type: object
|
||||
api.StartJobApiResponse:
|
||||
properties:
|
||||
id:
|
||||
description: Database ID of new job
|
||||
type: integer
|
||||
type: object
|
||||
api.StopJobApiRequest:
|
||||
properties:
|
||||
cluster:
|
||||
description: Cluster of job
|
||||
example: fritz
|
||||
type: string
|
||||
jobId:
|
||||
description: Cluster Job ID of job
|
||||
example: 123000
|
||||
type: integer
|
||||
jobState:
|
||||
allOf:
|
||||
- $ref: '#/definitions/schema.JobState'
|
||||
description: Final job state
|
||||
example: completed
|
||||
startTime:
|
||||
description: Start Time of job as epoch
|
||||
example: 1649723812
|
||||
type: integer
|
||||
stopTime:
|
||||
description: Stop Time of job as epoch
|
||||
example: 1649763839
|
||||
type: integer
|
||||
required:
|
||||
@@ -167,42 +160,40 @@ definitions:
|
||||
description: Information of a HPC job.
|
||||
properties:
|
||||
arrayJobId:
|
||||
description: The unique identifier of an array job
|
||||
example: 123000
|
||||
type: integer
|
||||
cluster:
|
||||
description: The unique identifier of a cluster
|
||||
example: fritz
|
||||
type: string
|
||||
concurrentJobs:
|
||||
$ref: '#/definitions/schema.JobLinkResultList'
|
||||
duration:
|
||||
description: Duration of job in seconds (Min > 0)
|
||||
example: 43200
|
||||
minimum: 1
|
||||
type: integer
|
||||
energy:
|
||||
type: number
|
||||
energyFootprint:
|
||||
additionalProperties:
|
||||
type: number
|
||||
type: object
|
||||
exclusive:
|
||||
description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs
|
||||
of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple
|
||||
jobs of same user'
|
||||
example: 1
|
||||
maximum: 2
|
||||
minimum: 0
|
||||
type: integer
|
||||
flopsAnyAvg:
|
||||
description: FlopsAnyAvg as Float64
|
||||
type: number
|
||||
footprint:
|
||||
additionalProperties:
|
||||
type: number
|
||||
type: object
|
||||
id:
|
||||
description: The unique identifier of a job in the database
|
||||
type: integer
|
||||
jobId:
|
||||
description: The unique identifier of a job
|
||||
example: 123000
|
||||
type: integer
|
||||
jobState:
|
||||
allOf:
|
||||
- $ref: '#/definitions/schema.JobState'
|
||||
description: Final state of job
|
||||
enum:
|
||||
- completed
|
||||
- failed
|
||||
@@ -211,79 +202,53 @@ definitions:
|
||||
- timeout
|
||||
- out_of_memory
|
||||
example: completed
|
||||
loadAvg:
|
||||
description: LoadAvg as Float64
|
||||
type: number
|
||||
memBwAvg:
|
||||
description: MemBwAvg as Float64
|
||||
type: number
|
||||
memUsedMax:
|
||||
description: MemUsedMax as Float64
|
||||
type: number
|
||||
metaData:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Additional information about the job
|
||||
type: object
|
||||
monitoringStatus:
|
||||
description: 'State of monitoring system during job run: 0 - Disabled, 1 -
|
||||
Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful'
|
||||
example: 1
|
||||
maximum: 3
|
||||
minimum: 0
|
||||
type: integer
|
||||
numAcc:
|
||||
description: Number of accelerators used (Min > 0)
|
||||
example: 2
|
||||
minimum: 1
|
||||
type: integer
|
||||
numHwthreads:
|
||||
description: Number of HWThreads used (Min > 0)
|
||||
example: 20
|
||||
minimum: 1
|
||||
type: integer
|
||||
numNodes:
|
||||
description: Number of nodes used (Min > 0)
|
||||
example: 2
|
||||
minimum: 1
|
||||
type: integer
|
||||
partition:
|
||||
description: The Slurm partition to which the job was submitted
|
||||
example: main
|
||||
type: string
|
||||
project:
|
||||
description: The unique identifier of a project
|
||||
example: abcd200
|
||||
type: string
|
||||
resources:
|
||||
description: Resources used by job
|
||||
items:
|
||||
$ref: '#/definitions/schema.Resource'
|
||||
type: array
|
||||
smt:
|
||||
description: SMT threads used by job
|
||||
example: 4
|
||||
type: integer
|
||||
startTime:
|
||||
description: Start time as 'time.Time' data type
|
||||
type: string
|
||||
subCluster:
|
||||
description: The unique identifier of a sub cluster
|
||||
example: main
|
||||
type: string
|
||||
tags:
|
||||
description: List of tags
|
||||
items:
|
||||
$ref: '#/definitions/schema.Tag'
|
||||
type: array
|
||||
user:
|
||||
description: The unique identifier of a user
|
||||
example: abcd100h
|
||||
type: string
|
||||
walltime:
|
||||
description: Requested walltime of job in seconds (Min > 0)
|
||||
example: 86400
|
||||
minimum: 1
|
||||
type: integer
|
||||
@@ -308,39 +273,40 @@ definitions:
|
||||
description: Meta data information of a HPC job.
|
||||
properties:
|
||||
arrayJobId:
|
||||
description: The unique identifier of an array job
|
||||
example: 123000
|
||||
type: integer
|
||||
cluster:
|
||||
description: The unique identifier of a cluster
|
||||
example: fritz
|
||||
type: string
|
||||
concurrentJobs:
|
||||
$ref: '#/definitions/schema.JobLinkResultList'
|
||||
duration:
|
||||
description: Duration of job in seconds (Min > 0)
|
||||
example: 43200
|
||||
minimum: 1
|
||||
type: integer
|
||||
energy:
|
||||
type: number
|
||||
energyFootprint:
|
||||
additionalProperties:
|
||||
type: number
|
||||
type: object
|
||||
exclusive:
|
||||
description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs
|
||||
of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple
|
||||
jobs of same user'
|
||||
example: 1
|
||||
maximum: 2
|
||||
minimum: 0
|
||||
type: integer
|
||||
footprint:
|
||||
additionalProperties:
|
||||
type: number
|
||||
type: object
|
||||
id:
|
||||
description: The unique identifier of a job in the database
|
||||
type: integer
|
||||
jobId:
|
||||
description: The unique identifier of a job
|
||||
example: 123000
|
||||
type: integer
|
||||
jobState:
|
||||
allOf:
|
||||
- $ref: '#/definitions/schema.JobState'
|
||||
description: Final state of job
|
||||
enum:
|
||||
- completed
|
||||
- failed
|
||||
@@ -352,74 +318,56 @@ definitions:
|
||||
metaData:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Additional information about the job
|
||||
type: object
|
||||
monitoringStatus:
|
||||
description: 'State of monitoring system during job run: 0 - Disabled, 1 -
|
||||
Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful'
|
||||
example: 1
|
||||
maximum: 3
|
||||
minimum: 0
|
||||
type: integer
|
||||
numAcc:
|
||||
description: Number of accelerators used (Min > 0)
|
||||
example: 2
|
||||
minimum: 1
|
||||
type: integer
|
||||
numHwthreads:
|
||||
description: Number of HWThreads used (Min > 0)
|
||||
example: 20
|
||||
minimum: 1
|
||||
type: integer
|
||||
numNodes:
|
||||
description: Number of nodes used (Min > 0)
|
||||
example: 2
|
||||
minimum: 1
|
||||
type: integer
|
||||
partition:
|
||||
description: The Slurm partition to which the job was submitted
|
||||
example: main
|
||||
type: string
|
||||
project:
|
||||
description: The unique identifier of a project
|
||||
example: abcd200
|
||||
type: string
|
||||
resources:
|
||||
description: Resources used by job
|
||||
items:
|
||||
$ref: '#/definitions/schema.Resource'
|
||||
type: array
|
||||
smt:
|
||||
description: SMT threads used by job
|
||||
example: 4
|
||||
type: integer
|
||||
startTime:
|
||||
description: Start epoch time stamp in seconds (Min > 0)
|
||||
example: 1649723812
|
||||
minimum: 1
|
||||
type: integer
|
||||
statistics:
|
||||
additionalProperties:
|
||||
$ref: '#/definitions/schema.JobStatistics'
|
||||
description: Metric statistics of job
|
||||
type: object
|
||||
subCluster:
|
||||
description: The unique identifier of a sub cluster
|
||||
example: main
|
||||
type: string
|
||||
tags:
|
||||
description: List of tags
|
||||
items:
|
||||
$ref: '#/definitions/schema.Tag'
|
||||
type: array
|
||||
user:
|
||||
description: The unique identifier of a user
|
||||
example: abcd100h
|
||||
type: string
|
||||
walltime:
|
||||
description: Requested walltime of job in seconds (Min > 0)
|
||||
example: 86400
|
||||
minimum: 1
|
||||
type: integer
|
||||
@@ -486,6 +434,12 @@ definitions:
|
||||
type: number
|
||||
caution:
|
||||
type: number
|
||||
energy:
|
||||
type: string
|
||||
footprint:
|
||||
type: string
|
||||
lowerIsBetter:
|
||||
type: boolean
|
||||
name:
|
||||
type: string
|
||||
normal:
|
||||
@@ -541,18 +495,14 @@ definitions:
|
||||
description: A resource used by a job
|
||||
properties:
|
||||
accelerators:
|
||||
description: List of accelerator device ids
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
configuration:
|
||||
description: The configuration options of the node
|
||||
type: string
|
||||
hostname:
|
||||
description: Name of the host (= node)
|
||||
type: string
|
||||
hwthreads:
|
||||
description: List of OS processor ids
|
||||
items:
|
||||
type: integer
|
||||
type: array
|
||||
@@ -580,6 +530,10 @@ definitions:
|
||||
items:
|
||||
type: number
|
||||
type: array
|
||||
median:
|
||||
items:
|
||||
type: number
|
||||
type: array
|
||||
min:
|
||||
items:
|
||||
type: number
|
||||
@@ -595,12 +549,24 @@ definitions:
|
||||
properties:
|
||||
coresPerSocket:
|
||||
type: integer
|
||||
energyFootprint:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
flopRateScalar:
|
||||
$ref: '#/definitions/schema.MetricValue'
|
||||
flopRateSimd:
|
||||
$ref: '#/definitions/schema.MetricValue'
|
||||
footprint:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
memoryBandwidth:
|
||||
$ref: '#/definitions/schema.MetricValue'
|
||||
metricConfig:
|
||||
items:
|
||||
$ref: '#/definitions/schema.MetricConfig'
|
||||
type: array
|
||||
name:
|
||||
type: string
|
||||
nodes:
|
||||
@@ -620,6 +586,12 @@ definitions:
|
||||
type: number
|
||||
caution:
|
||||
type: number
|
||||
energy:
|
||||
type: string
|
||||
footprint:
|
||||
type: string
|
||||
lowerIsBetter:
|
||||
type: boolean
|
||||
name:
|
||||
type: string
|
||||
normal:
|
||||
@@ -633,14 +605,14 @@ definitions:
|
||||
description: Defines a tag using name and type.
|
||||
properties:
|
||||
id:
|
||||
description: The unique DB identifier of a tag
|
||||
type: integer
|
||||
name:
|
||||
description: Tag Name
|
||||
example: Testjob
|
||||
type: string
|
||||
scope:
|
||||
example: global
|
||||
type: string
|
||||
type:
|
||||
description: Tag Type
|
||||
example: Debug
|
||||
type: string
|
||||
type: object
|
||||
@@ -929,7 +901,7 @@ paths:
|
||||
"200":
|
||||
description: Success message
|
||||
schema:
|
||||
$ref: '#/definitions/api.DeleteJobApiResponse'
|
||||
$ref: '#/definitions/api.DefaultJobApiResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -976,7 +948,7 @@ paths:
|
||||
"200":
|
||||
description: Success message
|
||||
schema:
|
||||
$ref: '#/definitions/api.DeleteJobApiResponse'
|
||||
$ref: '#/definitions/api.DefaultJobApiResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -1023,7 +995,7 @@ paths:
|
||||
"200":
|
||||
description: Success message
|
||||
schema:
|
||||
$ref: '#/definitions/api.DeleteJobApiResponse'
|
||||
$ref: '#/definitions/api.DefaultJobApiResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -1121,7 +1093,7 @@ paths:
|
||||
"201":
|
||||
description: Job added successfully
|
||||
schema:
|
||||
$ref: '#/definitions/api.StartJobApiResponse'
|
||||
$ref: '#/definitions/api.DefaultJobApiResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -1184,64 +1156,7 @@ paths:
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"422":
|
||||
description: 'Unprocessable Entity: finding job failed: sql: no rows in
|
||||
result set'
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Marks job as completed and triggers archiving
|
||||
tags:
|
||||
- Job add and modify
|
||||
/jobs/stop_job/{id}:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
description: |-
|
||||
Job to stop is specified by database ID. Only stopTime and final state are required in request body.
|
||||
Returns full job resource information according to 'JobMeta' scheme.
|
||||
parameters:
|
||||
- description: Database ID of Job
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: integer
|
||||
- description: stopTime and final state in request body
|
||||
in: body
|
||||
name: request
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/api.StopJobApiRequest'
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Job resource
|
||||
schema:
|
||||
$ref: '#/definitions/schema.JobMeta'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"401":
|
||||
description: Unauthorized
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"403":
|
||||
description: Forbidden
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"404":
|
||||
description: Resource not found
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"422":
|
||||
description: 'Unprocessable Entity: finding job failed: sql: no rows in
|
||||
result set'
|
||||
description: 'Unprocessable Entity: job has already been stopped'
|
||||
schema:
|
||||
$ref: '#/definitions/api.ErrorResponse'
|
||||
"500":
|
||||
@@ -1259,6 +1174,7 @@ paths:
|
||||
- application/json
|
||||
description: |-
|
||||
Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
|
||||
Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
|
||||
If tagged job is already finished: Tag will be written directly to respective archive files.
|
||||
parameters:
|
||||
- description: Job Database ID
|
||||
@@ -1302,6 +1218,51 @@ paths:
|
||||
summary: Adds one or more tags to a job
|
||||
tags:
|
||||
- Job add and modify
|
||||
/notice/:
|
||||
post:
|
||||
consumes:
|
||||
- multipart/form-data
|
||||
description: |-
|
||||
Modifies the content of notice.txt, shown as notice box on the homepage.
|
||||
If more than one formValue is set then only the highest priority field is used.
|
||||
Only accessible from IPs registered with apiAllowedIPs configuration option.
|
||||
parameters:
|
||||
- description: 'Priority 1: New content to display'
|
||||
in: formData
|
||||
name: new-content
|
||||
type: string
|
||||
produces:
|
||||
- text/plain
|
||||
responses:
|
||||
"200":
|
||||
description: Success Response Message
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
type: string
|
||||
"401":
|
||||
description: Unauthorized
|
||||
schema:
|
||||
type: string
|
||||
"403":
|
||||
description: Forbidden
|
||||
schema:
|
||||
type: string
|
||||
"422":
|
||||
description: 'Unprocessable Entity: The user could not be updated'
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Updates or empties the notice box content
|
||||
tags:
|
||||
- User
|
||||
/user/{id}:
|
||||
post:
|
||||
consumes:
|
||||
|
||||
33
cmd/cc-backend/cli.go
Normal file
33
cmd/cc-backend/cli.go
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import "flag"
|
||||
|
||||
// Command line flags of cc-backend. They are populated by cliInit.
var (
	// Boolean switches.
	flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
	// Flags that take a string argument.
	flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
)

// cliInit registers all command line flags on the default flag set and then
// parses the process arguments into the package-level flag variables.
//
// It must be called at most once per process: the flag package panics when
// the same flag name is registered twice.
func cliInit() {
	// Boolean switches, all defaulting to false.
	flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
	flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
	flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
	flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
	flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
	flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
	flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
	flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
	flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
	flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
	flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")

	// String valued flags. A back-quoted word inside the usage text is what
	// the flag package prints as the argument placeholder in -help output.
	flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
	flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin,support,manager,api,user]:<password>`")
	flag.StringVar(&flagDelUser, "del-user", "", "Remove user by `username`")
	flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
	flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
	flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`")

	flag.Parse()
}
|
||||
85
cmd/cc-backend/init.go
Normal file
85
cmd/cc-backend/init.go
Normal file
@@ -0,0 +1,85 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
)
|
||||
|
||||
const envString = `
|
||||
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||
# You can generate your own keypair using the gen-keypair tool
|
||||
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||
|
||||
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||
`
|
||||
|
||||
const configString = `
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "name",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2023-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
|
||||
func initEnv() {
|
||||
if util.CheckFileExists("var") {
|
||||
fmt.Print("Directory ./var already exists. Exiting!\n")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||
log.Fatalf("Writing config.json failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||
log.Fatalf("Writing .env failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.Mkdir("var", 0o777); err != nil {
|
||||
log.Fatalf("Mkdir var failed: %s", err.Error())
|
||||
}
|
||||
|
||||
err := repository.MigrateDB("sqlite3", "./var/job.db")
|
||||
if err != nil {
|
||||
log.Fatalf("Initialize job.db failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
@@ -5,158 +5,48 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/taskManager"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
"github.com/go-co-op/gocron"
|
||||
"github.com/google/gops/agent"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/mux"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
|
||||
_ "github.com/go-sql-driver/mysql"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
const logoString = `
|
||||
____ _ _ ____ _ _ _
|
||||
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
|
||||
_____ _ _ ____ _ _ _
|
||||
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
|
||||
| | | | | | / __| __/ _ \ '__| | / _ \ / __| |/ / '_ \| | __|
|
||||
| |___| | |_| \__ \ || __/ | | |__| (_) | (__| <| |_) | | |_
|
||||
\____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
|
||||
\_____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
|
||||
|_|
|
||||
`
|
||||
|
||||
const envString = `
|
||||
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
|
||||
# You can generate your own keypair using the gen-keypair tool
|
||||
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
|
||||
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
|
||||
|
||||
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
|
||||
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
|
||||
`
|
||||
|
||||
const configString = `
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "name",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2023-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
|
||||
var (
|
||||
date string
|
||||
commit string
|
||||
version string
|
||||
)
|
||||
|
||||
func initEnv() {
|
||||
if util.CheckFileExists("var") {
|
||||
fmt.Print("Directory ./var already exists. Exiting!\n")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
|
||||
log.Fatalf("Writing config.json failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
|
||||
log.Fatalf("Writing .env failed: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := os.Mkdir("var", 0o777); err != nil {
|
||||
log.Fatalf("Mkdir var failed: %s", err.Error())
|
||||
}
|
||||
|
||||
err := repository.MigrateDB("sqlite3", "./var/job.db")
|
||||
if err != nil {
|
||||
log.Fatalf("Initialize job.db failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
var flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
|
||||
var flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize swlite database file, config.json and .env")
|
||||
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
|
||||
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'user' table with ldap")
|
||||
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
|
||||
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
|
||||
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
|
||||
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
|
||||
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
|
||||
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
|
||||
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin,support,manager,api,user]:<password>`")
|
||||
flag.StringVar(&flagDelUser, "del-user", "", "Remove user by `username`")
|
||||
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
|
||||
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
|
||||
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`")
|
||||
flag.Parse()
|
||||
cliInit()
|
||||
|
||||
if flagVersion {
|
||||
fmt.Print(logoString)
|
||||
@@ -171,14 +61,6 @@ func main() {
|
||||
// Apply config flags for pkg/log
|
||||
log.Init(flagLogLevel, flagLogDateTime)
|
||||
|
||||
if flagInit {
|
||||
initEnv()
|
||||
fmt.Print("Succesfully setup environment!\n")
|
||||
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
|
||||
fmt.Print("Add your job-archive at ./var/job-archive.\n")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
// See https://github.com/google/gops (Runtime overhead is almost zero)
|
||||
if flagGops {
|
||||
if err := agent.Listen(agent.Options{}); err != nil {
|
||||
@@ -227,18 +109,18 @@ func main() {
|
||||
}
|
||||
|
||||
repository.Connect(config.Keys.DBDriver, config.Keys.DB)
|
||||
db := repository.GetConnection()
|
||||
|
||||
var authentication *auth.Authentication
|
||||
if flagInit {
|
||||
initEnv()
|
||||
fmt.Print("Successfully setup environment!\n")
|
||||
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
|
||||
fmt.Print("Add your job-archive at ./var/job-archive.\n")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
var err error
|
||||
if authentication, err = auth.Init(); err != nil {
|
||||
log.Fatalf("auth initialization failed: %v", err)
|
||||
}
|
||||
|
||||
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err != nil {
|
||||
authentication.SessionMaxAge = d
|
||||
}
|
||||
auth.Init()
|
||||
|
||||
if flagNewUser != "" {
|
||||
parts := strings.SplitN(flagNewUser, ":", 3)
|
||||
@@ -260,12 +142,14 @@ func main() {
|
||||
}
|
||||
}
|
||||
|
||||
authHandle := auth.GetAuthInstance()
|
||||
|
||||
if flagSyncLDAP {
|
||||
if authentication.LdapAuth == nil {
|
||||
if authHandle.LdapAuth == nil {
|
||||
log.Fatal("cannot sync: LDAP authentication is not configured")
|
||||
}
|
||||
|
||||
if err := authentication.LdapAuth.Sync(); err != nil {
|
||||
if err := authHandle.LdapAuth.Sync(); err != nil {
|
||||
log.Fatalf("LDAP sync failed: %v", err)
|
||||
}
|
||||
log.Info("LDAP sync successfull")
|
||||
@@ -282,7 +166,7 @@ func main() {
|
||||
log.Warnf("user '%s' does not have the API role", user.Username)
|
||||
}
|
||||
|
||||
jwt, err := authentication.JwtAuth.ProvideJWT(user)
|
||||
jwt, err := authHandle.JwtAuth.ProvideJWT(user)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to provide JWT to user '%s': %v", user.Username, err)
|
||||
}
|
||||
@@ -298,7 +182,7 @@ func main() {
|
||||
log.Fatalf("failed to initialize archive: %s", err.Error())
|
||||
}
|
||||
|
||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
||||
if err := metricdata.Init(); err != nil {
|
||||
log.Fatalf("failed to initialize metricdata repository: %s", err.Error())
|
||||
}
|
||||
|
||||
@@ -318,228 +202,16 @@ func main() {
|
||||
return
|
||||
}
|
||||
|
||||
// Setup the http.Handler/Router used by the server
|
||||
jobRepo := repository.GetJobRepository()
|
||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
||||
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
if os.Getenv("DEBUG") != "1" {
|
||||
// Having this handler means that an error message is returned via GraphQL instead of the connection simply being closed.
|
||||
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||
graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
|
||||
switch e := err.(type) {
|
||||
case string:
|
||||
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||
case error:
|
||||
return fmt.Errorf("MAIN > Panic caused by: %w", e)
|
||||
}
|
||||
|
||||
return errors.New("MAIN > Internal server error (panic)")
|
||||
})
|
||||
}
|
||||
|
||||
api := &api.RestApi{
|
||||
JobRepository: jobRepo,
|
||||
Resolver: resolver,
|
||||
MachineStateDir: config.Keys.MachineStateDir,
|
||||
Authentication: authentication,
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
|
||||
|
||||
info := map[string]interface{}{}
|
||||
info["hasOpenIDConnect"] = false
|
||||
|
||||
if config.Keys.OpenIDConfig != nil {
|
||||
openIDConnect := auth.NewOIDC(authentication)
|
||||
openIDConnect.RegisterEndpoints(r)
|
||||
info["hasOpenIDConnect"] = true
|
||||
}
|
||||
|
||||
r.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
log.Debugf("##%v##", info)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||
}).Methods(http.MethodGet)
|
||||
r.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||
})
|
||||
r.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||
})
|
||||
|
||||
secured := r.PathPrefix("/").Subrouter()
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
r.Handle("/login", authentication.Login(
|
||||
// On success:
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect),
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})).Methods(http.MethodPost)
|
||||
|
||||
r.Handle("/jwt-login", authentication.Login(
|
||||
// On success:
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect),
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))
|
||||
|
||||
r.Handle("/logout", authentication.Logout(
|
||||
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Bye - ClusterCockpit",
|
||||
MsgType: "alert-info",
|
||||
Message: "Logout successful",
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))).Methods(http.MethodPost)
|
||||
|
||||
secured.Use(func(next http.Handler) http.Handler {
|
||||
return authentication.Auth(
|
||||
// On success;
|
||||
next,
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Authentication failed - ClusterCockpit",
|
||||
MsgType: "alert-danger",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
if flagDev {
|
||||
r.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||
r.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
|
||||
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
|
||||
}
|
||||
secured.Handle("/query", graphQLEndpoint)
|
||||
|
||||
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
|
||||
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||
})
|
||||
|
||||
// Mount all /monitoring/... and /api/... routes.
|
||||
routerConfig.SetupRoutes(secured, buildInfo)
|
||||
api.MountRoutes(secured)
|
||||
|
||||
if config.Keys.EmbedStaticFiles {
|
||||
if i, err := os.Stat("./var/img"); err == nil {
|
||||
if i.IsDir() {
|
||||
log.Info("Use local directory for static images")
|
||||
r.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||
}
|
||||
}
|
||||
r.PathPrefix("/").Handler(web.ServeFiles())
|
||||
} else {
|
||||
r.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
|
||||
}
|
||||
|
||||
r.Use(handlers.CompressHandler)
|
||||
r.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
|
||||
r.Use(handlers.CORS(
|
||||
handlers.AllowCredentials(),
|
||||
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
|
||||
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
|
||||
handlers.AllowedOrigins([]string{"*"})))
|
||||
handler := handlers.CustomLoggingHandler(io.Discard, r, func(_ io.Writer, params handlers.LogFormatterParams) {
|
||||
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
} else {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
}
|
||||
})
|
||||
archiver.Start(repository.GetJobRepository())
|
||||
taskManager.Start()
|
||||
serverInit()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
server := http.Server{
|
||||
ReadTimeout: 10 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
Handler: handler,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||
if err != nil {
|
||||
log.Fatalf("starting http listener failed: %v", err)
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
|
||||
}()
|
||||
}
|
||||
|
||||
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
|
||||
if err != nil {
|
||||
log.Fatalf("loading X509 keypair failed: %v", err)
|
||||
}
|
||||
listener = tls.NewListener(listener, &tls.Config{
|
||||
Certificates: []tls.Certificate{cert},
|
||||
CipherSuites: []uint16{
|
||||
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
},
|
||||
MinVersion: tls.VersionTLS12,
|
||||
PreferServerCipherSuites: true,
|
||||
})
|
||||
fmt.Printf("HTTPS server listening at %s...", config.Keys.Addr)
|
||||
} else {
|
||||
fmt.Printf("HTTP server listening at %s...", config.Keys.Addr)
|
||||
}
|
||||
|
||||
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||
// be established first, then the user can be changed, and after that,
|
||||
// the actual http server can be started.
|
||||
if err = runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||
log.Fatalf("error while preparing server start: %s", err.Error())
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("starting server failed: %v", err)
|
||||
}
|
||||
serverStart()
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
@@ -550,117 +222,15 @@ func main() {
|
||||
<-sigs
|
||||
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
|
||||
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
server.Shutdown(context.Background())
|
||||
serverShutdown()
|
||||
|
||||
// Then, wait for any async archivings still pending...
|
||||
api.JobRepository.WaitForArchiving()
|
||||
taskManager.Shutdown()
|
||||
}()
|
||||
|
||||
s := gocron.NewScheduler(time.Local)
|
||||
|
||||
if config.Keys.StopJobsExceedingWalltime > 0 {
|
||||
log.Info("Register undead jobs service")
|
||||
|
||||
s.Every(1).Day().At("3:00").Do(func() {
|
||||
err = jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
|
||||
}
|
||||
runtime.GC()
|
||||
})
|
||||
}
|
||||
|
||||
var cfg struct {
|
||||
Retention schema.Retention `json:"retention"`
|
||||
Compression int `json:"compression"`
|
||||
}
|
||||
|
||||
cfg.Retention.IncludeDB = true
|
||||
|
||||
if err = json.Unmarshal(config.Keys.Archive, &cfg); err != nil {
|
||||
log.Warn("Error while unmarshaling raw config json")
|
||||
}
|
||||
|
||||
switch cfg.Retention.Policy {
|
||||
case "delete":
|
||||
log.Info("Register retention delete service")
|
||||
|
||||
s.Every(1).Day().At("4:00").Do(func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().CleanUp(jobs)
|
||||
|
||||
if cfg.Retention.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %s", err.Error())
|
||||
}
|
||||
}
|
||||
})
|
||||
case "move":
|
||||
log.Info("Register retention move service")
|
||||
|
||||
s.Every(1).Day().At("4:00").Do(func() {
|
||||
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().Move(jobs, cfg.Retention.Location)
|
||||
|
||||
if cfg.Retention.IncludeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %v", err)
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %v", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if cfg.Compression > 0 {
|
||||
log.Info("Register compression service")
|
||||
|
||||
s.Every(1).Day().At("5:00").Do(func() {
|
||||
var jobs []*schema.Job
|
||||
|
||||
ar := archive.GetHandle()
|
||||
startTime := time.Now().Unix() - int64(cfg.Compression*24*3600)
|
||||
lastTime := ar.CompressLast(startTime)
|
||||
if startTime == lastTime {
|
||||
log.Info("Compression Service - Complete archive run")
|
||||
jobs, err = jobRepo.FindJobsBetween(0, startTime)
|
||||
|
||||
} else {
|
||||
jobs, err = jobRepo.FindJobsBetween(lastTime, startTime)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for compression jobs: %v", err)
|
||||
}
|
||||
ar.Compress(jobs)
|
||||
})
|
||||
}
|
||||
|
||||
s.StartAsync()
|
||||
|
||||
if os.Getenv("GOGC") == "" {
|
||||
debug.SetGCPercent(25)
|
||||
}
|
||||
runtimeEnv.SystemdNotifiy(true, "running")
|
||||
wg.Wait()
|
||||
log.Print("Gracefull shutdown completed!")
|
||||
log.Print("Graceful shutdown completed!")
|
||||
}
|
||||
|
||||
318
cmd/cc-backend/server.go
Normal file
318
cmd/cc-backend/server.go
Normal file
@@ -0,0 +1,318 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/mux"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
)
|
||||
|
||||
// Package-level server state shared between the server functions in this file:
// router and apiHandle are assigned in serverInit, server is assigned in
// serverStart.
var (
|
||||
router *mux.Router
|
||||
server *http.Server
|
||||
apiHandle *api.RestApi
|
||||
)
|
||||
|
||||
// onFailureResponse answers a rejected request with HTTP 401 and a JSON object
// holding the textual status ("status") and the message of err ("error").
// serverInit passes it as the failure callback of the API subrouters.
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
	const code = http.StatusUnauthorized

	rw.Header().Add("Content-Type", "application/json")
	rw.WriteHeader(code)

	body := map[string]string{
		"status": http.StatusText(code),
		"error":  err.Error(),
	}
	encoder := json.NewEncoder(rw)
	encoder.Encode(body)
}
|
||||
|
||||
func serverInit() {
|
||||
// Setup the http.Handler/Router used by the server
|
||||
graph.Init()
|
||||
resolver := graph.GetResolverInstance()
|
||||
graphQLEndpoint := handler.NewDefaultServer(
|
||||
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
|
||||
if os.Getenv("DEBUG") != "1" {
|
||||
// Having this handler means that a error message is returned via GraphQL instead of the connection simply beeing closed.
|
||||
// The problem with this is that then, no more stacktrace is printed to stderr.
|
||||
graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
|
||||
switch e := err.(type) {
|
||||
case string:
|
||||
return fmt.Errorf("MAIN > Panic: %s", e)
|
||||
case error:
|
||||
return fmt.Errorf("MAIN > Panic caused by: %w", e)
|
||||
}
|
||||
|
||||
return errors.New("MAIN > Internal server error (panic)")
|
||||
})
|
||||
}
|
||||
|
||||
authHandle := auth.GetAuthInstance()
|
||||
|
||||
apiHandle = api.New()
|
||||
|
||||
router = mux.NewRouter()
|
||||
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
|
||||
|
||||
info := map[string]interface{}{}
|
||||
info["hasOpenIDConnect"] = false
|
||||
|
||||
if config.Keys.OpenIDConfig != nil {
|
||||
openIDConnect := auth.NewOIDC(authHandle)
|
||||
openIDConnect.RegisterEndpoints(router)
|
||||
info["hasOpenIDConnect"] = true
|
||||
}
|
||||
|
||||
router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
log.Debugf("##%v##", info)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
|
||||
}).Methods(http.MethodGet)
|
||||
router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
|
||||
})
|
||||
router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
|
||||
})
|
||||
|
||||
secured := router.PathPrefix("/").Subrouter()
|
||||
securedapi := router.PathPrefix("/api").Subrouter()
|
||||
userapi := router.PathPrefix("/userapi").Subrouter()
|
||||
configapi := router.PathPrefix("/config").Subrouter()
|
||||
frontendapi := router.PathPrefix("/frontend").Subrouter()
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
router.Handle("/login", authHandle.Login(
|
||||
// On success: Handled within Login()
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
})).Methods(http.MethodPost)
|
||||
|
||||
router.Handle("/jwt-login", authHandle.Login(
|
||||
// On success: Handled within Login()
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Login failed - ClusterCockpit",
|
||||
MsgType: "alert-warning",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))
|
||||
|
||||
router.Handle("/logout", authHandle.Logout(
|
||||
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Bye - ClusterCockpit",
|
||||
MsgType: "alert-info",
|
||||
Message: "Logout successful",
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
})
|
||||
}))).Methods(http.MethodPost)
|
||||
|
||||
secured.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.Auth(
|
||||
// On success;
|
||||
next,
|
||||
|
||||
// On failure:
|
||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
||||
rw.WriteHeader(http.StatusUnauthorized)
|
||||
web.RenderTemplate(rw, "login.tmpl", &web.Page{
|
||||
Title: "Authentication failed - ClusterCockpit",
|
||||
MsgType: "alert-danger",
|
||||
Message: err.Error(),
|
||||
Build: buildInfo,
|
||||
Infos: info,
|
||||
Redirect: r.RequestURI,
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
securedapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthApi(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
userapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthUserApi(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
configapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthConfigApi(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
|
||||
frontendapi.Use(func(next http.Handler) http.Handler {
|
||||
return authHandle.AuthFrontendApi(
|
||||
// On success;
|
||||
next,
|
||||
// On failure: JSON Response
|
||||
onFailureResponse)
|
||||
})
|
||||
}
|
||||
|
||||
if flagDev {
|
||||
router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
|
||||
router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
|
||||
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
|
||||
}
|
||||
secured.Handle("/query", graphQLEndpoint)
|
||||
|
||||
// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
|
||||
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
|
||||
routerConfig.HandleSearchBar(rw, r, buildInfo)
|
||||
})
|
||||
|
||||
// Mount all /monitoring/... and /api/... routes.
|
||||
routerConfig.SetupRoutes(secured, buildInfo)
|
||||
apiHandle.MountApiRoutes(securedapi)
|
||||
apiHandle.MountUserApiRoutes(userapi)
|
||||
apiHandle.MountConfigApiRoutes(configapi)
|
||||
apiHandle.MountFrontendApiRoutes(frontendapi)
|
||||
|
||||
if config.Keys.EmbedStaticFiles {
|
||||
if i, err := os.Stat("./var/img"); err == nil {
|
||||
if i.IsDir() {
|
||||
log.Info("Use local directory for static images")
|
||||
router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
|
||||
}
|
||||
}
|
||||
router.PathPrefix("/").Handler(web.ServeFiles())
|
||||
} else {
|
||||
router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
|
||||
}
|
||||
|
||||
router.Use(handlers.CompressHandler)
|
||||
router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
|
||||
router.Use(handlers.CORS(
|
||||
handlers.AllowCredentials(),
|
||||
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
|
||||
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
|
||||
handlers.AllowedOrigins([]string{"*"})))
|
||||
}
|
||||
|
||||
func serverStart() {
|
||||
handler := handlers.CustomLoggingHandler(io.Discard, router, func(_ io.Writer, params handlers.LogFormatterParams) {
|
||||
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
} else {
|
||||
log.Debugf("%s %s (%d, %.02fkb, %dms)",
|
||||
params.Request.Method, params.URL.RequestURI(),
|
||||
params.StatusCode, float32(params.Size)/1024,
|
||||
time.Since(params.TimeStamp).Milliseconds())
|
||||
}
|
||||
})
|
||||
|
||||
server = &http.Server{
|
||||
ReadTimeout: 20 * time.Second,
|
||||
WriteTimeout: 20 * time.Second,
|
||||
Handler: handler,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
listener, err := net.Listen("tcp", config.Keys.Addr)
|
||||
if err != nil {
|
||||
log.Fatalf("starting http listener failed: %v", err)
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
|
||||
}()
|
||||
}
|
||||
|
||||
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(
|
||||
config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
|
||||
if err != nil {
|
||||
log.Fatalf("loading X509 keypair failed: %v", err)
|
||||
}
|
||||
listener = tls.NewListener(listener, &tls.Config{
|
||||
Certificates: []tls.Certificate{cert},
|
||||
CipherSuites: []uint16{
|
||||
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
},
|
||||
MinVersion: tls.VersionTLS12,
|
||||
PreferServerCipherSuites: true,
|
||||
})
|
||||
fmt.Printf("HTTPS server listening at %s...", config.Keys.Addr)
|
||||
} else {
|
||||
fmt.Printf("HTTP server listening at %s...", config.Keys.Addr)
|
||||
}
|
||||
//
|
||||
// Because this program will want to bind to a privileged port (like 80), the listener must
|
||||
// be established first, then the user can be changed, and after that,
|
||||
// the actual http server can be started.
|
||||
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
|
||||
log.Fatalf("error while preparing server start: %s", err.Error())
|
||||
}
|
||||
|
||||
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("starting server failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func serverShutdown() {
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
server.Shutdown(context.Background())
|
||||
|
||||
// Then, wait for any async archivings still pending...
|
||||
archiver.WaitForArchiving()
|
||||
}
|
||||
@@ -1,56 +1,67 @@
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
"addr": "127.0.0.1:8080",
|
||||
"short-running-jobs-duration": 300,
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"emission-constant": 317,
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
69
configs/config-mariadb.json
Normal file
69
configs/config-mariadb.json
Normal file
@@ -0,0 +1,69 @@
|
||||
{
|
||||
"addr": "127.0.0.1:8080",
|
||||
"short-running-jobs-duration": 300,
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"jwts": {
|
||||
"max-age": "2000h"
|
||||
},
|
||||
"db-driver": "mysql",
|
||||
"db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit",
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"emission-constant": 317,
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": ""
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
|
||||
|
||||
my $node;
|
||||
my @sockets;
|
||||
my @nodeCores;
|
||||
foreach my $socket ( @{$DOMAINS{socket}} ) {
|
||||
push @sockets, "[".join(",", @{$socket})."]";
|
||||
$node .= join(",", @{$socket})
|
||||
push @nodeCores, join(",", @{$socket});
|
||||
}
|
||||
$node = join(",", @nodeCores);
|
||||
$INFO{sockets} = join(",\n", @sockets);
|
||||
|
||||
my @memDomains;
|
||||
@@ -212,9 +214,27 @@ print <<"END";
|
||||
"socketsPerNode": $INFO{socketsPerNode},
|
||||
"coresPerSocket": $INFO{coresPerSocket},
|
||||
"threadsPerCore": $INFO{threadsPerCore},
|
||||
"flopRateScalar": $flopsScalar,
|
||||
"flopRateSimd": $flopsSimd,
|
||||
"memoryBandwidth": $memBw,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsScalar
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $flopsSimd
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": $memBw
|
||||
},
|
||||
"nodes": "<FILL IN NODE RANGES>",
|
||||
"topology": {
|
||||
"node": [$node],
|
||||
|
||||
106
go.mod
106
go.mod
@@ -1,91 +1,89 @@
|
||||
module github.com/ClusterCockpit/cc-backend
|
||||
|
||||
go 1.18
|
||||
go 1.23.5
|
||||
|
||||
require (
|
||||
github.com/99designs/gqlgen v0.17.45
|
||||
github.com/99designs/gqlgen v0.17.63
|
||||
github.com/ClusterCockpit/cc-units v0.4.0
|
||||
github.com/Masterminds/squirrel v1.5.3
|
||||
github.com/coreos/go-oidc/v3 v3.9.0
|
||||
github.com/go-co-op/gocron v1.25.0
|
||||
github.com/go-ldap/ldap/v3 v3.4.4
|
||||
github.com/go-sql-driver/mysql v1.7.0
|
||||
github.com/Masterminds/squirrel v1.5.4
|
||||
github.com/coreos/go-oidc/v3 v3.11.0
|
||||
github.com/go-co-op/gocron/v2 v2.9.0
|
||||
github.com/go-ldap/ldap/v3 v3.4.8
|
||||
github.com/go-sql-driver/mysql v1.8.1
|
||||
github.com/golang-jwt/jwt/v5 v5.2.1
|
||||
github.com/golang-migrate/migrate/v4 v4.15.2
|
||||
github.com/google/gops v0.3.27
|
||||
github.com/gorilla/handlers v1.5.1
|
||||
github.com/gorilla/mux v1.8.0
|
||||
github.com/gorilla/sessions v1.2.1
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.12.2
|
||||
github.com/jmoiron/sqlx v1.3.5
|
||||
github.com/mattn/go-sqlite3 v1.14.16
|
||||
github.com/prometheus/client_golang v1.14.0
|
||||
github.com/prometheus/common v0.40.0
|
||||
github.com/golang-migrate/migrate/v4 v4.17.1
|
||||
github.com/google/gops v0.3.28
|
||||
github.com/gorilla/handlers v1.5.2
|
||||
github.com/gorilla/mux v1.8.1
|
||||
github.com/gorilla/sessions v1.4.0
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.13.0
|
||||
github.com/jmoiron/sqlx v1.4.0
|
||||
github.com/mattn/go-sqlite3 v1.14.22
|
||||
github.com/prometheus/client_golang v1.19.1
|
||||
github.com/prometheus/common v0.55.0
|
||||
github.com/qustavo/sqlhooks/v2 v2.1.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
|
||||
github.com/swaggo/http-swagger v1.3.3
|
||||
github.com/swaggo/swag v1.16.3
|
||||
github.com/vektah/gqlparser/v2 v2.5.11
|
||||
golang.org/x/crypto v0.21.0
|
||||
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
|
||||
golang.org/x/oauth2 v0.13.0
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
|
||||
github.com/swaggo/http-swagger v1.3.4
|
||||
github.com/swaggo/swag v1.16.4
|
||||
github.com/vektah/gqlparser/v2 v2.5.22
|
||||
golang.org/x/crypto v0.32.0
|
||||
golang.org/x/exp v0.0.0-20240707233637-46b078467d37
|
||||
golang.org/x/oauth2 v0.21.0
|
||||
)
|
||||
|
||||
require (
|
||||
filippo.io/edwards25519 v1.1.0 // indirect
|
||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
||||
github.com/agnivade/levenshtein v1.2.1 // indirect
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
||||
github.com/containerd/containerd v1.6.26 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
|
||||
github.com/deepmap/oapi-codegen v1.12.4 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.3 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
|
||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect
|
||||
github.com/go-jose/go-jose/v4 v4.0.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.21.0 // indirect
|
||||
github.com/go-openapi/spec v0.21.0 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
github.com/golang/protobuf v1.5.3 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.1 // indirect
|
||||
github.com/gorilla/websocket v1.5.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.2 // indirect
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
||||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||
github.com/jonboulle/clockwork v0.4.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/jpillora/backoff v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
||||
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/mailru/easyjson v0.9.0 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
|
||||
github.com/oapi-codegen/runtime v1.1.1 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/prometheus/client_model v0.3.0 // indirect
|
||||
github.com/prometheus/procfs v0.9.0 // indirect
|
||||
github.com/prometheus/client_model v0.6.1 // indirect
|
||||
github.com/prometheus/procfs v0.15.1 // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/sosodev/duration v1.2.0 // indirect
|
||||
github.com/swaggo/files v1.0.0 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.1 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
|
||||
go.uber.org/atomic v1.10.0 // indirect
|
||||
golang.org/x/mod v0.16.0 // indirect
|
||||
golang.org/x/net v0.22.0 // indirect
|
||||
golang.org/x/sys v0.18.0 // indirect
|
||||
golang.org/x/text v0.14.0 // indirect
|
||||
golang.org/x/tools v0.19.0 // indirect
|
||||
google.golang.org/appengine v1.6.8 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
|
||||
google.golang.org/protobuf v1.33.0 // indirect
|
||||
github.com/sosodev/duration v1.3.1 // indirect
|
||||
github.com/swaggo/files v1.0.1 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.5 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
golang.org/x/mod v0.22.0 // indirect
|
||||
golang.org/x/net v0.34.0 // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.29.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
golang.org/x/tools v0.29.0 // indirect
|
||||
google.golang.org/protobuf v1.36.1 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
||||
|
||||
61
gqlgen.yml
61
gqlgen.yml
@@ -61,23 +61,50 @@ models:
|
||||
fields:
|
||||
partitions:
|
||||
resolver: true
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
|
||||
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
|
||||
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
|
||||
NullableFloat:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
|
||||
MetricScope:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
|
||||
MetricValue:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
|
||||
JobStatistics:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
|
||||
GlobalMetricListItem:
|
||||
{
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.GlobalMetricListItem",
|
||||
}
|
||||
ClusterSupport:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.ClusterSupport" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
|
||||
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
|
||||
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
|
||||
Resource:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
|
||||
JobState:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
|
||||
TimeRange:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
|
||||
IntRange:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
|
||||
JobMetric:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
|
||||
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
|
||||
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
|
||||
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
|
||||
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
|
||||
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
|
||||
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
|
||||
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
|
||||
MetricStatistics:
|
||||
{
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics",
|
||||
}
|
||||
MetricConfig:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
|
||||
SubClusterConfig:
|
||||
{
|
||||
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig",
|
||||
}
|
||||
Accelerator:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
|
||||
Topology:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
|
||||
FilterRanges:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
|
||||
SubCluster:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
|
||||
StatsSeries:
|
||||
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
|
||||
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit Web Server (Go edition)
|
||||
Description=ClusterCockpit Web Server
|
||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
@@ -14,13 +14,16 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
@@ -117,7 +120,7 @@ func setup(t *testing.T) *api.RestApi {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -144,23 +147,20 @@ func setup(t *testing.T) *api.RestApi {
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
repository.Connect("sqlite3", dbfilepath)
|
||||
db := repository.GetConnection()
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
|
||||
if err := metricdata.Init(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
jobRepo := repository.GetJobRepository()
|
||||
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
|
||||
archiver.Start(repository.GetJobRepository())
|
||||
auth.Init()
|
||||
graph.Init()
|
||||
|
||||
return &api.RestApi{
|
||||
JobRepository: resolver.Repo,
|
||||
Resolver: resolver,
|
||||
}
|
||||
return api.New()
|
||||
}
|
||||
|
||||
func cleanup() {
|
||||
@@ -175,7 +175,6 @@ func cleanup() {
|
||||
func TestRestApi(t *testing.T) {
|
||||
restapi := setup(t)
|
||||
t.Cleanup(cleanup)
|
||||
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
@@ -192,12 +191,18 @@ func TestRestApi(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
restapi.MountRoutes(r)
|
||||
r.PathPrefix("/api").Subrouter()
|
||||
r.StrictSlash(true)
|
||||
restapi.MountApiRoutes(r)
|
||||
|
||||
var TestJobId int64 = 123
|
||||
var TestClusterName string = "testcluster"
|
||||
var TestStartTime int64 = 123456789
|
||||
|
||||
const startJobBody string = `{
|
||||
"jobId": 123,
|
||||
@@ -213,7 +218,7 @@ func TestRestApi(t *testing.T) {
|
||||
"exclusive": 1,
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"tags": [{ "type": "testTagType", "name": "testTagName" }],
|
||||
"tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }],
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
@@ -224,28 +229,33 @@ func TestRestApi(t *testing.T) {
|
||||
"startTime": 123456789
|
||||
}`
|
||||
|
||||
var dbid int64
|
||||
const contextUserKey repository.ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
if ok := t.Run("StartJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
var res api.StartJobApiResponse
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
|
||||
resolver := graph.GetResolverInstance()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
|
||||
job.Tags, err = resolver.Job().Tags(ctx, job)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -269,11 +279,9 @@ func TestRestApi(t *testing.T) {
|
||||
t.Fatalf("unexpected job properties: %#v", job)
|
||||
}
|
||||
|
||||
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
|
||||
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
|
||||
t.Fatalf("unexpected tags: %#v", job.Tags)
|
||||
}
|
||||
|
||||
dbid = res.DBID
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
@@ -289,17 +297,19 @@ func TestRestApi(t *testing.T) {
|
||||
|
||||
var stoppedJob *schema.Job
|
||||
if ok := t.Run("StopJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
|
||||
archiver.WaitForArchiving()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -327,7 +337,7 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
|
||||
t.Run("CheckArchive", func(t *testing.T) {
|
||||
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
|
||||
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -341,10 +351,12 @@ func TestRestApi(t *testing.T) {
|
||||
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart!
|
||||
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusUnprocessableEntity {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -371,10 +383,12 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok := t.Run("StartJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusCreated {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
@@ -384,8 +398,10 @@ func TestRestApi(t *testing.T) {
|
||||
t.Fatal("subtest failed")
|
||||
}
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
const stopJobBodyFailed string = `{
|
||||
"jobId": 12345,
|
||||
"jobId": 12345,
|
||||
"cluster": "testcluster",
|
||||
|
||||
"jobState": "failed",
|
||||
@@ -393,16 +409,18 @@ func TestRestApi(t *testing.T) {
|
||||
}`
|
||||
|
||||
ok = t.Run("StopJobFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(recorder, req)
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
restapi.JobRepository.WaitForArchiving()
|
||||
archiver.WaitForArchiving()
|
||||
jobid, cluster := int64(12345), "testcluster"
|
||||
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||
if err != nil {
|
||||
|
||||
@@ -208,7 +208,7 @@ const docTemplate = `{
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -278,7 +278,7 @@ const docTemplate = `{
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -348,7 +348,7 @@ const docTemplate = `{
|
||||
"200": {
|
||||
"description": "Success message",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.DeleteJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -493,7 +493,7 @@ const docTemplate = `{
|
||||
"201": {
|
||||
"description": "Job added successfully",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.StartJobApiResponse"
|
||||
"$ref": "#/definitions/api.DefaultJobApiResponse"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
@@ -587,89 +587,7 @@ const docTemplate = `{
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: finding job failed: sql: no rows in result set",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/jobs/stop_job/{id}": {
|
||||
"post": {
|
||||
"security": [
|
||||
{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Job to stop is specified by database ID. Only stopTime and final state are required in request body.\nReturns full job resource information according to 'JobMeta' scheme.",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
"tags": [
|
||||
"Job add and modify"
|
||||
],
|
||||
"summary": "Marks job as completed and triggers archiving",
|
||||
"parameters": [
|
||||
{
|
||||
"type": "integer",
|
||||
"description": "Database ID of Job",
|
||||
"name": "id",
|
||||
"in": "path",
|
||||
"required": true
|
||||
},
|
||||
{
|
||||
"description": "stopTime and final state in request body",
|
||||
"name": "request",
|
||||
"in": "body",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.StopJobApiRequest"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Job resource",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/schema.JobMeta"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "Bad Request",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"401": {
|
||||
"description": "Unauthorized",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"404": {
|
||||
"description": "Resource not found",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: finding job failed: sql: no rows in result set",
|
||||
"description": "Unprocessable Entity: job has already been stopped",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
@@ -690,7 +608,7 @@ const docTemplate = `{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nIf tagged job is already finished: Tag will be written directly to respective archive files.",
|
||||
"description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
@@ -915,6 +833,72 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"/notice/": {
|
||||
"post": {
|
||||
"security": [
|
||||
{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Modifies the content of notice.txt, shown as notice box on the homepage.\nIf more than one formValue is set then only the highest priority field is used.\nOnly accessible from IPs registered with apiAllowedIPs configuration option.",
|
||||
"consumes": [
|
||||
"multipart/form-data"
|
||||
],
|
||||
"produces": [
|
||||
"text/plain"
|
||||
],
|
||||
"tags": [
|
||||
"User"
|
||||
],
|
||||
"summary": "Updates or empties the notice box content",
|
||||
"parameters": [
|
||||
{
|
||||
"type": "string",
|
||||
"description": "Priority 1: New content to display",
|
||||
"name": "new-content",
|
||||
"in": "formData"
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Success Response Message",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "Bad Request",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"401": {
|
||||
"description": "Unauthorized",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity: The user could not be updated",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/user/{id}": {
|
||||
"post": {
|
||||
"security": [
|
||||
@@ -1283,6 +1267,11 @@ const docTemplate = `{
|
||||
"type": "string",
|
||||
"example": "Testjob"
|
||||
},
|
||||
"scope": {
|
||||
"description": "Tag Scope for Frontend Display",
|
||||
"type": "string",
|
||||
"example": "global"
|
||||
},
|
||||
"type": {
|
||||
"description": "Tag Type",
|
||||
"type": "string",
|
||||
@@ -1290,6 +1279,14 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DefaultJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"msg": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DeleteJobApiRequest": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
@@ -1313,14 +1310,6 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.DeleteJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"msg": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.EditMetaRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -1407,15 +1396,6 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.StartJobApiResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"description": "Database ID of new job",
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api.StopJobApiRequest": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
@@ -1424,17 +1404,14 @@ const docTemplate = `{
|
||||
],
|
||||
"properties": {
|
||||
"cluster": {
|
||||
"description": "Cluster of job",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "Cluster Job ID of job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final job state",
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/definitions/schema.JobState"
|
||||
@@ -1443,12 +1420,10 @@ const docTemplate = `{
|
||||
"example": "completed"
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start Time of job as epoch",
|
||||
"type": "integer",
|
||||
"example": 1649723812
|
||||
},
|
||||
"stopTime": {
|
||||
"description": "Stop Time of job as epoch",
|
||||
"type": "integer",
|
||||
"example": 1649763839
|
||||
}
|
||||
@@ -1493,12 +1468,10 @@ const docTemplate = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
@@ -1506,33 +1479,39 @@ const docTemplate = `{
|
||||
"$ref": "#/definitions/schema.JobLinkResultList"
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 43200
|
||||
},
|
||||
"energy": {
|
||||
"type": "number"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"maximum": 2,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"flopsAnyAvg": {
|
||||
"description": "FlopsAnyAvg as Float64",
|
||||
"type": "number"
|
||||
"footprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"description": "The unique identifier of a job in the database",
|
||||
"type": "integer"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
@@ -1548,95 +1527,69 @@ const docTemplate = `{
|
||||
],
|
||||
"example": "completed"
|
||||
},
|
||||
"loadAvg": {
|
||||
"description": "LoadAvg as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"memBwAvg": {
|
||||
"description": "MemBwAvg as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"memUsedMax": {
|
||||
"description": "MemUsedMax as Float64",
|
||||
"type": "number"
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull",
|
||||
"type": "integer",
|
||||
"maximum": 3,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "NumCores int32 ` + "`" + `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` + "`" + ` // Number of HWThreads used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 20
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string",
|
||||
"example": "abcd200"
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Resource"
|
||||
}
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer",
|
||||
"example": 4
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start time as 'time.Time' data type",
|
||||
"type": "string"
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Tag"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"type": "string",
|
||||
"example": "abcd100h"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 86400
|
||||
@@ -1673,12 +1626,10 @@ const docTemplate = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string",
|
||||
"example": "fritz"
|
||||
},
|
||||
@@ -1686,29 +1637,39 @@ const docTemplate = `{
|
||||
"$ref": "#/definitions/schema.JobLinkResultList"
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 43200
|
||||
},
|
||||
"energy": {
|
||||
"type": "number"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"maximum": 2,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"footprint": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"description": "The unique identifier of a job in the database",
|
||||
"type": "integer"
|
||||
},
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer",
|
||||
"example": 123000
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
@@ -1725,91 +1686,76 @@ const docTemplate = `{
|
||||
"example": "completed"
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull",
|
||||
"type": "integer",
|
||||
"maximum": 3,
|
||||
"minimum": 0,
|
||||
"example": 1
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "NumCores int32 ` + "`" + `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` + "`" + ` // Number of HWThreads used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 20
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 2
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string",
|
||||
"example": "abcd200"
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Resource"
|
||||
}
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer",
|
||||
"example": 4
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start epoch time stamp in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 1649723812
|
||||
},
|
||||
"statistics": {
|
||||
"description": "Metric statistics of job",
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/definitions/schema.JobStatistics"
|
||||
}
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string",
|
||||
"example": "main"
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.Tag"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"type": "string",
|
||||
"example": "abcd100h"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds (Min \u003e 0)",
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"example": 86400
|
||||
@@ -1898,6 +1844,15 @@ const docTemplate = `{
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"energy": {
|
||||
"type": "string"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "string"
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -1975,22 +1930,18 @@ const docTemplate = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"accelerators": {
|
||||
"description": "List of of accelerator device ids",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"configuration": {
|
||||
"description": "The configuration options of the node",
|
||||
"type": "string"
|
||||
},
|
||||
"hostname": {
|
||||
"description": "Name of the host (= node)",
|
||||
"type": "string"
|
||||
},
|
||||
"hwthreads": {
|
||||
"description": "List of OS processor ids",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
@@ -2033,6 +1984,12 @@ const docTemplate = `{
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"median": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"min": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
@@ -2056,15 +2013,33 @@ const docTemplate = `{
|
||||
"coresPerSocket": {
|
||||
"type": "integer"
|
||||
},
|
||||
"energyFootprint": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"flopRateScalar": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"$ref": "#/definitions/schema.MetricValue"
|
||||
},
|
||||
"metricConfig": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.MetricConfig"
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -2094,6 +2069,15 @@ const docTemplate = `{
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"energy": {
|
||||
"type": "string"
|
||||
},
|
||||
"footprint": {
|
||||
"type": "string"
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -2113,16 +2097,17 @@ const docTemplate = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"description": "The unique DB identifier of a tag",
|
||||
"type": "integer"
|
||||
},
|
||||
"name": {
|
||||
"description": "Tag Name",
|
||||
"type": "string",
|
||||
"example": "Testjob"
|
||||
},
|
||||
"scope": {
|
||||
"type": "string",
|
||||
"example": "global"
|
||||
},
|
||||
"type": {
|
||||
"description": "Tag Type",
|
||||
"type": "string",
|
||||
"example": "Debug"
|
||||
}
|
||||
|
||||
@@ -19,12 +19,13 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
@@ -53,19 +54,24 @@ import (
|
||||
|
||||
type RestApi struct {
|
||||
JobRepository *repository.JobRepository
|
||||
Resolver *graph.Resolver
|
||||
Authentication *auth.Authentication
|
||||
MachineStateDir string
|
||||
RepositoryMutex sync.Mutex
|
||||
}
|
||||
|
||||
func (api *RestApi) MountRoutes(r *mux.Router) {
|
||||
r = r.PathPrefix("/api").Subrouter()
|
||||
func New() *RestApi {
|
||||
return &RestApi{
|
||||
JobRepository: repository.GetJobRepository(),
|
||||
MachineStateDir: config.Keys.MachineStateDir,
|
||||
Authentication: auth.GetAuthInstance(),
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestApi) MountApiRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
|
||||
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/stop_job/{id}", api.stopJobById).Methods(http.MethodPost, http.MethodPut)
|
||||
// r.HandleFunc("/jobs/import/", api.importJob).Methods(http.MethodPost, http.MethodPut)
|
||||
|
||||
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
|
||||
@@ -84,31 +90,41 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestApi) MountUserApiRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
|
||||
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/{id}", api.getJobById).Methods(http.MethodPost)
|
||||
r.HandleFunc("/jobs/{id}", api.getCompleteJobById).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
|
||||
}
|
||||
|
||||
func (api *RestApi) MountConfigApiRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
|
||||
r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
|
||||
r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
|
||||
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
|
||||
}
|
||||
}
|
||||
|
||||
// StartJobApiResponse model
|
||||
type StartJobApiResponse struct {
|
||||
// Database ID of new job
|
||||
DBID int64 `json:"id"`
|
||||
}
|
||||
|
||||
// DeleteJobApiResponse model
|
||||
type DeleteJobApiResponse struct {
|
||||
Message string `json:"msg"`
|
||||
}
|
||||
|
||||
// UpdateUserApiResponse model
|
||||
type UpdateUserApiResponse struct {
|
||||
// DefaultApiResponse model
|
||||
type DefaultJobApiResponse struct {
|
||||
Message string `json:"msg"`
|
||||
}
|
||||
|
||||
@@ -150,8 +166,9 @@ type ErrorResponse struct {
|
||||
// ApiTag model
|
||||
type ApiTag struct {
|
||||
// Tag Type
|
||||
Type string `json:"type" example:"Debug"`
|
||||
Name string `json:"name" example:"Testjob"` // Tag Name
|
||||
Type string `json:"type" example:"Debug"`
|
||||
Name string `json:"name" example:"Testjob"` // Tag Name
|
||||
Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display
|
||||
}
|
||||
|
||||
// ApiMeta model
|
||||
@@ -311,17 +328,10 @@ func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/ [get]
|
||||
func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
withMetadata := false
|
||||
filter := &model.JobFilter{}
|
||||
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
|
||||
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
|
||||
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
|
||||
|
||||
for key, vals := range r.URL.Query() {
|
||||
switch key {
|
||||
@@ -400,7 +410,7 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
StartTime: job.StartTime.Unix(),
|
||||
}
|
||||
|
||||
res.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
res.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
@@ -434,7 +444,7 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// getJobById godoc
|
||||
// getCompleteJobById godoc
|
||||
// @summary Get job meta and optional all metric data
|
||||
// @tags Job query
|
||||
// @description Job to get is specified by database ID
|
||||
@@ -452,14 +462,6 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/{id} [get]
|
||||
func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v",
|
||||
schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
@@ -471,17 +473,17 @@ func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(id)
|
||||
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
@@ -503,10 +505,17 @@ func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request)
|
||||
|
||||
var data schema.JobData
|
||||
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
resolution := 0
|
||||
|
||||
for _, mc := range metricConfigs {
|
||||
resolution = max(resolution, mc.Timestep)
|
||||
}
|
||||
|
||||
if r.URL.Query().Get("all-metrics") == "true" {
|
||||
data, err = metricdata.LoadData(job, nil, scopes, r.Context())
|
||||
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job data")
|
||||
log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -546,14 +555,6 @@ func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request)
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/{id} [post]
|
||||
func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v",
|
||||
schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
@@ -565,17 +566,17 @@ func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(id)
|
||||
job, err = api.JobRepository.FindById(r.Context(), id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID)
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
@@ -601,9 +602,16 @@ func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
scopes = []schema.MetricScope{"node"}
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadData(job, metrics, scopes, r.Context())
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
resolution := 0
|
||||
|
||||
for _, mc := range metricConfigs {
|
||||
resolution = max(resolution, mc.Timestep)
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job data")
|
||||
log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -651,19 +659,13 @@ func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/edit_meta/{id} [post]
|
||||
func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
iid, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(iid)
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
@@ -689,6 +691,7 @@ func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
// @summary Adds one or more tags to a job
|
||||
// @tags Job add and modify
|
||||
// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
|
||||
// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
|
||||
// @description If tagged job is already finished: Tag will be written directly to respective archive files.
|
||||
// @accept json
|
||||
// @produce json
|
||||
@@ -702,26 +705,19 @@ func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/tag_job/{id} [post]
|
||||
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
iid, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(iid)
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@@ -734,16 +730,17 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
for _, tag := range req {
|
||||
tagId, err := api.JobRepository.AddTagOrCreate(job.ID, tag.Type, tag.Name)
|
||||
tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), job.ID, tag.Type, tag.Name, tag.Scope)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags = append(job.Tags, &schema.Tag{
|
||||
ID: tagId,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
ID: tagId,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
Scope: tag.Scope,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -760,7 +757,7 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body schema.JobMeta true "Job to add"
|
||||
// @success 201 {object} api.StartJobApiResponse "Job added successfully"
|
||||
// @success 201 {object} api.DefaultJobApiResponse "Job added successfully"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -769,22 +766,14 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/start_job/ [post]
|
||||
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
req := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State == "" {
|
||||
req.State = schema.JobStateRunning
|
||||
}
|
||||
req.State = schema.JobStateRunning
|
||||
|
||||
if err := importer.SanityChecks(&req.BaseJob); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
@@ -818,7 +807,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
for _, tag := range req.Tags {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
@@ -828,68 +817,11 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
json.NewEncoder(rw).Encode(StartJobApiResponse{
|
||||
DBID: id,
|
||||
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||
Message: "success",
|
||||
})
|
||||
}
|
||||
|
||||
// stopJobById godoc
|
||||
// @summary Marks job as completed and triggers archiving
|
||||
// @tags Job add and modify
|
||||
// @description Job to stop is specified by database ID. Only stopTime and final state are required in request body.
|
||||
// @description Returns full job resource information according to 'JobMeta' scheme.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @param request body api.StopJobApiRequest true "stopTime and final state in request body"
|
||||
// @success 200 {object} schema.JobMeta "Job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/stop_job/{id} [post]
|
||||
func (api *RestApi) stopJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse request body: Only StopTime and State
|
||||
req := StopJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
api.checkAndHandleStopJob(rw, job, req)
|
||||
}
|
||||
|
||||
// stopJobByRequest godoc
|
||||
// @summary Marks job as completed and triggers archiving
|
||||
// @tags Job add and modify
|
||||
@@ -902,18 +834,11 @@ func (api *RestApi) stopJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} api.ErrorResponse "Resource not found"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
|
||||
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/stop_job/ [post]
|
||||
func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse request body
|
||||
req := StopJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
@@ -929,8 +854,8 @@ func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
@@ -945,7 +870,7 @@ func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @success 200 {object} api.DeleteJobApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultJobApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -955,11 +880,6 @@ func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/delete_job/{id} [delete]
|
||||
func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil && !user.HasRole(schema.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var err error
|
||||
@@ -981,7 +901,7 @@ func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
||||
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted job %s", id),
|
||||
})
|
||||
}
|
||||
@@ -993,7 +913,7 @@ func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body api.DeleteJobApiRequest true "All fields required"
|
||||
// @success 200 {object} api.DeleteJobApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultJobApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -1003,12 +923,6 @@ func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/delete_job/ [delete]
|
||||
func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse request body
|
||||
req := DeleteJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
@@ -1025,7 +939,6 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
@@ -1039,7 +952,7 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
||||
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
|
||||
})
|
||||
}
|
||||
@@ -1050,7 +963,7 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
|
||||
// @produce json
|
||||
// @param ts path int true "Unix epoch timestamp"
|
||||
// @success 200 {object} api.DeleteJobApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultJobApiResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -1060,11 +973,6 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/delete_job_before/{ts} [delete]
|
||||
func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil && !user.HasRole(schema.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
var cnt int
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["ts"]
|
||||
@@ -1088,20 +996,25 @@ func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
||||
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
|
||||
})
|
||||
}
|
||||
|
||||
func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) {
|
||||
// Sanity checks
|
||||
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
|
||||
handleError(errors.New("stopTime must be larger than startTime and only running jobs can be stopped"), http.StatusBadRequest, rw)
|
||||
if job.State != schema.JobStateRunning {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if job == nil || job.StartTime.Unix() > req.StopTime {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime.Unix()), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
handleError(fmt.Errorf("invalid job state: %#v", req.State), http.StatusBadRequest, rw)
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
|
||||
return
|
||||
} else if req.State == "" {
|
||||
req.State = schema.JobStateCompleted
|
||||
@@ -1111,11 +1024,11 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
|
||||
job.Duration = int32(req.StopTime - job.StartTime.Unix())
|
||||
job.State = req.State
|
||||
if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
handleError(fmt.Errorf("marking job as stopped failed: %w", err), http.StatusInternalServerError, rw)
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
|
||||
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
|
||||
// Send a response (with status OK). This means that errors that happen from here on forward
|
||||
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
|
||||
@@ -1130,7 +1043,7 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
api.JobRepository.TriggerArchiving(job)
|
||||
archiver.TriggerArchiving(job)
|
||||
}
|
||||
|
||||
func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
@@ -1158,7 +1071,8 @@ func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
} `json:"error"`
|
||||
}
|
||||
|
||||
data, err := api.Resolver.Query().JobMetrics(r.Context(), id, metrics, scopes)
|
||||
resolver := graph.GetResolverInstance()
|
||||
data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil)
|
||||
if err != nil {
|
||||
json.NewEncoder(rw).Encode(Respone{
|
||||
Error: &struct {
|
||||
@@ -1386,6 +1300,69 @@ func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// editNotice godoc
|
||||
// @summary Updates or empties the notice box content
|
||||
// @tags User
|
||||
// @description Modifies the content of notice.txt, shown as notice box on the homepage.
|
||||
// @description If more than one formValue is set then only the highest priority field is used.
|
||||
// @description Only accessible from IPs registered with apiAllowedIPs configuration option.
|
||||
// @accept mpfd
|
||||
// @produce plain
|
||||
// @param new-content formData string false "Priority 1: New content to display"
|
||||
// @success 200 {string} string "Success Response Message"
|
||||
// @failure 400 {string} string "Bad Request"
|
||||
// @failure 401 {string} string "Unauthorized"
|
||||
// @failure 403 {string} string "Forbidden"
|
||||
// @failure 422 {string} string "Unprocessable Entity: The user could not be updated"
|
||||
// @failure 500 {string} string "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /notice/ [post]
|
||||
func (api *RestApi) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
err := securedCheck(r)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
http.Error(rw, "Only admins are allowed to update the notice.txt file", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
// Get Value
|
||||
newContent := r.FormValue("new-content")
|
||||
|
||||
// Check FIle
|
||||
noticeExists := util.CheckFileExists("./var/notice.txt")
|
||||
if !noticeExists {
|
||||
ntxt, err := os.Create("./var/notice.txt")
|
||||
if err != nil {
|
||||
log.Errorf("Creating ./var/notice.txt failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
ntxt.Close()
|
||||
}
|
||||
|
||||
if newContent != "" {
|
||||
if err := os.WriteFile("./var/notice.txt", []byte(newContent), 0o666); err != nil {
|
||||
log.Errorf("Writing to ./var/notice.txt failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
} else {
|
||||
rw.Write([]byte("Update Notice Content Success"))
|
||||
}
|
||||
} else {
|
||||
if err := os.WriteFile("./var/notice.txt", []byte(""), 0o666); err != nil {
|
||||
log.Errorf("Writing to ./var/notice.txt failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
} else {
|
||||
rw.Write([]byte("Empty Notice Content Success"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestApi) getJWT(rw http.ResponseWriter, r *http.Request) {
|
||||
err := securedCheck(r)
|
||||
if err != nil {
|
||||
@@ -1446,7 +1423,7 @@ func (api *RestApi) updateConfiguration(rw http.ResponseWriter, r *http.Request)
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
key, value := r.FormValue("key"), r.FormValue("value")
|
||||
|
||||
fmt.Printf("REST > KEY: %#v\nVALUE: %#v\n", key, value)
|
||||
// fmt.Printf("REST > KEY: %#v\nVALUE: %#v\n", key, value)
|
||||
|
||||
if err := repository.GetUserCfgRepo().UpdateConfig(key, value, repository.GetUserFromContext(r.Context())); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
|
||||
94
internal/archiver/archiveWorker.go
Normal file
94
internal/archiver/archiveWorker.go
Normal file
@@ -0,0 +1,94 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
var (
|
||||
archivePending sync.WaitGroup
|
||||
archiveChannel chan *schema.Job
|
||||
jobRepo *repository.JobRepository
|
||||
)
|
||||
|
||||
func Start(r *repository.JobRepository) {
|
||||
archiveChannel = make(chan *schema.Job, 128)
|
||||
jobRepo = r
|
||||
|
||||
go archivingWorker()
|
||||
}
|
||||
|
||||
// Archiving worker thread
|
||||
func archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
||||
|
||||
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
// Update the jobs database entry one last time:
|
||||
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
||||
if err := jobRepo.Execute(stmt); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func TriggerArchiving(job *schema.Job) {
|
||||
if archiveChannel == nil {
|
||||
log.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
|
||||
}
|
||||
|
||||
archivePending.Add(1)
|
||||
archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
archivePending.Wait()
|
||||
}
|
||||
83
internal/archiver/archiver.go
Normal file
83
internal/archiver/archiver.go
Normal file
@@ -0,0 +1,83 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
// FIXME: Add a config option for this
|
||||
if job.NumNodes <= 8 {
|
||||
// This will add the native scope if core scope is not available
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
if job.NumAcc > 0 {
|
||||
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||
}
|
||||
|
||||
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||
if err != nil {
|
||||
log.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// This should never happen ?
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
// Round AVG Result to 2 Digits
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if config.Keys.DisableArchive {
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
|
||||
}
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"errors"
|
||||
"net/http"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
@@ -26,6 +27,11 @@ type Authenticator interface {
|
||||
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
|
||||
}
|
||||
|
||||
var (
|
||||
initOnce sync.Once
|
||||
authInstance *Authentication
|
||||
)
|
||||
|
||||
type Authentication struct {
|
||||
sessionStore *sessions.CookieStore
|
||||
LdapAuth *LdapAuthenticator
|
||||
@@ -62,82 +68,111 @@ func (auth *Authentication) AuthViaSession(
|
||||
}, nil
|
||||
}
|
||||
|
||||
func Init() (*Authentication, error) {
|
||||
auth := &Authentication{}
|
||||
func Init() {
|
||||
initOnce.Do(func() {
|
||||
authInstance = &Authentication{}
|
||||
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
log.Error("Error while initializing authentication -> decoding session key failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
if config.Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
log.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
log.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
auth.LdapAuth = ldapAuth
|
||||
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig != nil {
|
||||
auth.JwtAuth = &JWTAuthenticator{}
|
||||
if err := auth.JwtAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> jwtAuth init failed")
|
||||
return nil, err
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
log.Fatal("Error while initializing authentication -> decoding session key failed")
|
||||
}
|
||||
authInstance.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err != nil {
|
||||
authInstance.SessionMaxAge = d
|
||||
}
|
||||
|
||||
if config.Keys.LdapConfig != nil {
|
||||
ldapAuth := &LdapAuthenticator{}
|
||||
if err := ldapAuth.Init(); err != nil {
|
||||
log.Warn("Error while initializing authentication -> ldapAuth init failed")
|
||||
} else {
|
||||
authInstance.LdapAuth = ldapAuth
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
|
||||
log.Info("Missing LDAP configuration: No LDAP support!")
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
if config.Keys.JwtConfig != nil {
|
||||
authInstance.JwtAuth = &JWTAuthenticator{}
|
||||
if err := authInstance.JwtAuth.Init(); err != nil {
|
||||
log.Fatal("Error while initializing authentication -> jwtAuth init failed")
|
||||
}
|
||||
|
||||
jwtSessionAuth := &JWTSessionAuthenticator{}
|
||||
if err := jwtSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtSessionAuth init failed: No JWT login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
|
||||
}
|
||||
|
||||
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
|
||||
if err := jwtCookieSessionAuth.Init(); err != nil {
|
||||
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
|
||||
} else {
|
||||
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
|
||||
}
|
||||
} else {
|
||||
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
|
||||
log.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
} else {
|
||||
log.Info("Missing JWT configuration: No JWT token support!")
|
||||
}
|
||||
|
||||
auth.LocalAuth = &LocalAuthenticator{}
|
||||
if err := auth.LocalAuth.Init(); err != nil {
|
||||
log.Error("Error while initializing authentication -> localAuth init failed")
|
||||
return nil, err
|
||||
}
|
||||
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
|
||||
|
||||
return auth, nil
|
||||
authInstance.LocalAuth = &LocalAuthenticator{}
|
||||
if err := authInstance.LocalAuth.Init(); err != nil {
|
||||
log.Fatal("Error while initializing authentication -> localAuth init failed")
|
||||
}
|
||||
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
|
||||
})
|
||||
}
|
||||
|
||||
func persistUser(user *schema.User) {
|
||||
func GetAuthInstance() *Authentication {
|
||||
if authInstance == nil {
|
||||
log.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return authInstance
|
||||
}
|
||||
|
||||
func handleTokenUser(tokenUser *schema.User) {
|
||||
r := repository.GetUserRepository()
|
||||
_, err := r.GetUser(user.Username)
|
||||
dbUser, err := r.GetUser(tokenUser.Username)
|
||||
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||
} else if err == sql.ErrNoRows {
|
||||
if err := r.AddUser(user); err != nil {
|
||||
log.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||
log.Errorf("Error while loading user '%s': %v", tokenUser.Username, err)
|
||||
} else if err == sql.ErrNoRows && config.Keys.JwtConfig.SyncUserOnLogin { // Adds New User
|
||||
if err := r.AddUser(tokenUser); err != nil {
|
||||
log.Errorf("Error while adding user '%s' to DB: %v", tokenUser.Username, err)
|
||||
}
|
||||
} else if err == nil && config.Keys.JwtConfig.UpdateUserOnLogin { // Update Existing User
|
||||
if err := r.UpdateUser(dbUser, tokenUser); err != nil {
|
||||
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func handleOIDCUser(OIDCUser *schema.User) {
|
||||
r := repository.GetUserRepository()
|
||||
dbUser, err := r.GetUser(OIDCUser.Username)
|
||||
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
log.Errorf("Error while loading user '%s': %v", OIDCUser.Username, err)
|
||||
} else if err == sql.ErrNoRows && config.Keys.OpenIDConfig.SyncUserOnLogin { // Adds New User
|
||||
if err := r.AddUser(OIDCUser); err != nil {
|
||||
log.Errorf("Error while adding user '%s' to DB: %v", OIDCUser.Username, err)
|
||||
}
|
||||
} else if err == nil && config.Keys.OpenIDConfig.UpdateUserOnLogin { // Update Existing User
|
||||
if err := r.UpdateUser(dbUser, OIDCUser); err != nil {
|
||||
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -153,6 +188,10 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
if auth.SessionMaxAge != 0 {
|
||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||
}
|
||||
if config.Keys.HttpsCertFile == "" && config.Keys.HttpsKeyFile == "" {
|
||||
session.Options.Secure = false
|
||||
}
|
||||
session.Options.SameSite = http.SameSiteStrictMode
|
||||
session.Values["username"] = user.Username
|
||||
session.Values["projects"] = user.Projects
|
||||
session.Values["roles"] = user.Roles
|
||||
@@ -166,7 +205,6 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
}
|
||||
|
||||
func (auth *Authentication) Login(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
@@ -203,7 +241,13 @@ func (auth *Authentication) Login(
|
||||
|
||||
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
|
||||
if r.FormValue("redirect") != "" {
|
||||
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
@@ -219,31 +263,141 @@ func (auth *Authentication) Auth(
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
log.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
if user == nil {
|
||||
user, err = auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("authentication failed: %s", err.Error())
|
||||
log.Infof("auth -> authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug("authentication failed")
|
||||
log.Info("auth -> authentication failed")
|
||||
onfailure(rw, r, errors.New("unauthorized (please login first)"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthApi(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("auth api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
log.Info("auth api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
log.Info("auth api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthUserApi(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("auth user api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
default:
|
||||
log.Info("auth user api -> authentication failed: missing role")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
}
|
||||
}
|
||||
log.Info("auth user api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthConfigApi(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("auth config api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil && user.HasRole(schema.RoleAdmin) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
log.Info("auth config api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) AuthFrontendApi(
|
||||
onsuccess http.Handler,
|
||||
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
|
||||
) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.AuthViaSession(rw, r)
|
||||
if err != nil {
|
||||
log.Infof("auth frontend api -> authentication failed: %s", err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
log.Info("auth frontend api -> authentication failed: no auth")
|
||||
onfailure(rw, r, errors.New("unauthorized"))
|
||||
})
|
||||
}
|
||||
|
||||
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
|
||||
@@ -198,8 +198,8 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if jc.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if jc.SyncUserOnLogin || jc.UpdateUserOnLogin {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -138,8 +138,8 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
AuthSource: schema.AuthViaToken,
|
||||
}
|
||||
|
||||
if config.Keys.JwtConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if config.Keys.JwtConfig.SyncUserOnLogin || config.Keys.JwtConfig.UpdateUserOnLogin {
|
||||
handleTokenUser(user)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
@@ -34,33 +33,6 @@ func (la *LdapAuthenticator) Init() error {
|
||||
|
||||
lc := config.Keys.LdapConfig
|
||||
|
||||
if lc.SyncInterval != "" {
|
||||
interval, err := time.ParseDuration(lc.SyncInterval)
|
||||
if err != nil {
|
||||
log.Warnf("Could not parse duration for sync interval: %v",
|
||||
lc.SyncInterval)
|
||||
return err
|
||||
}
|
||||
|
||||
if interval == 0 {
|
||||
log.Info("Sync interval is zero")
|
||||
return nil
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
for t := range ticker.C {
|
||||
log.Printf("sync started at %s", t.Format(time.RFC3339))
|
||||
if err := la.Sync(); err != nil {
|
||||
log.Errorf("sync failed: %s", err.Error())
|
||||
}
|
||||
log.Print("sync done")
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
log.Info("LDAP configuration key sync_interval invalid")
|
||||
}
|
||||
|
||||
if lc.UserAttr != "" {
|
||||
la.UserAttr = lc.UserAttr
|
||||
} else {
|
||||
|
||||
@@ -168,8 +168,8 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
AuthSource: schema.AuthViaOIDC,
|
||||
}
|
||||
|
||||
if config.Keys.OpenIDConfig.SyncUserOnLogin {
|
||||
persistUser(user)
|
||||
if config.Keys.OpenIDConfig.SyncUserOnLogin || config.Keys.OpenIDConfig.UpdateUserOnLogin {
|
||||
handleOIDCUser(user)
|
||||
}
|
||||
|
||||
oa.authentication.SaveSession(rw, r, user)
|
||||
|
||||
@@ -29,10 +29,9 @@ var Keys schema.ProgramConfig = schema.ProgramConfig{
|
||||
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
|
||||
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
|
||||
"job_view_showFootprint": true,
|
||||
"job_list_usePaging": true,
|
||||
"job_list_usePaging": false,
|
||||
"plot_general_colorBackground": true,
|
||||
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
|
||||
"plot_general_lineWidth": 3,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -16,11 +16,23 @@ type Count struct {
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
// EnergyFootprintValue is one entry of a job's energy footprint: a hardware
// category label, the metric it was derived from, and the value itself.
type EnergyFootprintValue struct {
	// Hardware is the hardware category label.
	Hardware string `json:"hardware"`
	// Metric is the name of the underlying metric.
	Metric string `json:"metric"`
	// Value is the footprint value for Metric.
	Value float64 `json:"value"`
}
|
||||
|
||||
// FloatRange describes a floating-point interval by its two bounds.
type FloatRange struct {
	// From is the first bound of the interval.
	From float64 `json:"from"`
	// To is the second bound of the interval.
	To float64 `json:"to"`
}
|
||||
|
||||
// FootprintValue is one entry of a job footprint: a metric name, the
// statistic that was applied to it, and the resulting value.
type FootprintValue struct {
	// Name is the metric name.
	Name string `json:"name"`
	// Stat names the statistic the value represents.
	Stat string `json:"stat"`
	// Value is the footprint value for Name/Stat.
	Value float64 `json:"value"`
}
|
||||
|
||||
type Footprints struct {
|
||||
TimeWeights *TimeWeights `json:"timeWeights"`
|
||||
Metrics []*MetricFootprints `json:"metrics"`
|
||||
@@ -46,16 +58,14 @@ type JobFilter struct {
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Partition *StringInput `json:"partition,omitempty"`
|
||||
Duration *schema.IntRange `json:"duration,omitempty"`
|
||||
Energy *FloatRange `json:"energy,omitempty"`
|
||||
MinRunningFor *int `json:"minRunningFor,omitempty"`
|
||||
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
|
||||
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
|
||||
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
|
||||
StartTime *schema.TimeRange `json:"startTime,omitempty"`
|
||||
State []schema.JobState `json:"state,omitempty"`
|
||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
|
||||
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
|
||||
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
|
||||
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
|
||||
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
|
||||
Exclusive *int `json:"exclusive,omitempty"`
|
||||
Node *StringInput `json:"node,omitempty"`
|
||||
}
|
||||
@@ -120,9 +130,15 @@ type MetricHistoPoint struct {
|
||||
type MetricHistoPoints struct {
|
||||
Metric string `json:"metric"`
|
||||
Unit string `json:"unit"`
|
||||
Stat *string `json:"stat,omitempty"`
|
||||
Data []*MetricHistoPoint `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
type MetricStatItem struct {
|
||||
MetricName string `json:"metricName"`
|
||||
Range *FloatRange `json:"range"`
|
||||
}
|
||||
|
||||
// Mutation is an empty placeholder type; it carries no fields.
type Mutation struct{}
|
||||
|
||||
@@ -132,8 +148,18 @@ type NodeMetrics struct {
|
||||
Metrics []*JobMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type NodesResultList struct {
|
||||
Items []*NodeMetrics `json:"items"`
|
||||
Offset *int `json:"offset,omitempty"`
|
||||
Limit *int `json:"limit,omitempty"`
|
||||
Count *int `json:"count,omitempty"`
|
||||
TotalNodes *int `json:"totalNodes,omitempty"`
|
||||
HasNextPage *bool `json:"hasNextPage,omitempty"`
|
||||
}
|
||||
|
||||
type OrderByInput struct {
|
||||
Field string `json:"field"`
|
||||
Type string `json:"type"`
|
||||
Order SortDirectionEnum `json:"order"`
|
||||
}
|
||||
|
||||
@@ -155,8 +181,9 @@ type StringInput struct {
|
||||
}
|
||||
|
||||
type TimeRangeOutput struct {
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
Range *string `json:"range,omitempty"`
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type TimeWeights struct {
|
||||
|
||||
@@ -1,15 +1,39 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// This file will not be regenerated automatically.
|
||||
//
|
||||
// It serves as dependency injection for your app, add any dependencies you require here.
|
||||
var (
|
||||
initOnce sync.Once
|
||||
resolverInstance *Resolver
|
||||
)
|
||||
|
||||
type Resolver struct {
|
||||
DB *sqlx.DB
|
||||
Repo *repository.JobRepository
|
||||
}
|
||||
|
||||
func Init() {
|
||||
initOnce.Do(func() {
|
||||
db := repository.GetConnection()
|
||||
resolverInstance = &Resolver{
|
||||
DB: db.DB, Repo: repository.GetJobRepository(),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetResolverInstance() *Resolver {
|
||||
if resolverInstance == nil {
|
||||
log.Fatal("Authentication module not initialized!")
|
||||
}
|
||||
|
||||
return resolverInstance
|
||||
}
|
||||
|
||||
@@ -2,19 +2,22 @@ package graph
|
||||
|
||||
// This file will be automatically regenerated based on the schema, any resolver implementations
|
||||
// will be copied through when generating and any unknown code will be moved to the end.
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.45
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.57
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
@@ -28,15 +31,12 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(&obj.ID)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID)
|
||||
}
|
||||
|
||||
// ConcurrentJobs is the resolver for the concurrentJobs field.
|
||||
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
|
||||
if obj.State == schema.JobStateRunning {
|
||||
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
|
||||
}
|
||||
|
||||
// FIXME: Make the hardcoded duration configurable
|
||||
if obj.Exclusive != 1 && obj.Duration > 600 {
|
||||
return r.Repo.FindConcurrentJobs(ctx, obj)
|
||||
}
|
||||
@@ -44,8 +44,72 @@ func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*mod
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Footprint is the resolver for the footprint field.
|
||||
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
|
||||
rawFootprint, err := r.Repo.FetchFootprint(obj)
|
||||
if err != nil {
|
||||
log.Warn("Error while fetching job footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.FootprintValue{}
|
||||
for name, value := range rawFootprint {
|
||||
|
||||
parts := strings.Split(name, "_")
|
||||
statPart := parts[len(parts)-1]
|
||||
nameParts := parts[:len(parts)-1]
|
||||
|
||||
res = append(res, &model.FootprintValue{
|
||||
Name: strings.Join(nameParts, "_"),
|
||||
Stat: statPart,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
// EnergyFootprint is the resolver for the energyFootprint field.
|
||||
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
|
||||
rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
|
||||
if err != nil {
|
||||
log.Warn("Error while fetching job energy footprint data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.EnergyFootprintValue{}
|
||||
for name, value := range rawEnergyFootprint {
|
||||
// Suboptimal: Nearly hardcoded metric name expectations
|
||||
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
|
||||
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
|
||||
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
|
||||
matchCore := regexp.MustCompile(`core|Core|CORE`)
|
||||
|
||||
hwType := ""
|
||||
switch test := name; { // NOtice ';' for var declaration
|
||||
case matchCpu.MatchString(test):
|
||||
hwType = "CPU"
|
||||
case matchAcc.MatchString(test):
|
||||
hwType = "Accelerator"
|
||||
case matchMem.MatchString(test):
|
||||
hwType = "Memory"
|
||||
case matchCore.MatchString(test):
|
||||
hwType = "Core"
|
||||
default:
|
||||
hwType = "Other"
|
||||
}
|
||||
|
||||
res = append(res, &model.EnergyFootprintValue{
|
||||
Hardware: hwType,
|
||||
Metric: name,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (any, error) {
|
||||
return r.Repo.FetchMetadata(obj)
|
||||
}
|
||||
|
||||
@@ -54,15 +118,20 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use
|
||||
return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User)
|
||||
}
|
||||
|
||||
// Name is the resolver for the name field.
|
||||
func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) {
|
||||
panic(fmt.Errorf("not implemented: Name - name"))
|
||||
}
|
||||
|
||||
// CreateTag is the resolver for the createTag field.
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
|
||||
id, err := r.Repo.CreateTag(typeArg, name)
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
|
||||
id, err := r.Repo.CreateTag(typeArg, name, scope)
|
||||
if err != nil {
|
||||
log.Warn("Error while creating tag")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
|
||||
}
|
||||
|
||||
// DeleteTag is the resolver for the deleteTag field.
|
||||
@@ -72,6 +141,7 @@ func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, er
|
||||
|
||||
// AddTagsToJob is the resolver for the addTagsToJob field.
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
// Selectable Tags Pre-Filtered by Scope in Frontend: No backend check required
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while adding tag to job")
|
||||
@@ -86,7 +156,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
|
||||
if tags, err = r.Repo.AddTag(repository.GetUserFromContext(ctx), jid, tid); err != nil {
|
||||
log.Warn("Error while adding tag")
|
||||
return nil, err
|
||||
}
|
||||
@@ -97,6 +167,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
|
||||
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
// Removable Tags Pre-Filtered by Scope in Frontend: No backend check required
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
log.Warn("Error while parsing job id")
|
||||
@@ -111,7 +182,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
|
||||
if tags, err = r.Repo.RemoveTag(repository.GetUserFromContext(ctx), jid, tid); err != nil {
|
||||
log.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
}
|
||||
@@ -137,7 +208,12 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error)
|
||||
|
||||
// Tags is the resolver for the tags field.
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(nil)
|
||||
return r.Repo.GetTags(repository.GetUserFromContext(ctx), nil)
|
||||
}
|
||||
|
||||
// GlobalMetrics is the resolver for the globalMetrics field.
|
||||
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
|
||||
return archive.GlobalMetricList, nil
|
||||
}
|
||||
|
||||
// User is the resolver for the user field.
|
||||
@@ -172,7 +248,7 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job, err := r.Repo.FindById(numericId)
|
||||
job, err := r.Repo.FindById(ctx, numericId)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
@@ -188,14 +264,24 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
|
||||
}
|
||||
|
||||
// JobMetrics is the resolver for the jobMetrics field.
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying job for metrics")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job data")
|
||||
return nil, err
|
||||
@@ -217,6 +303,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
|
||||
|
||||
// JobsFootprints is the resolver for the jobsFootprints field.
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
}
|
||||
|
||||
@@ -268,10 +355,14 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
|
||||
}
|
||||
|
||||
// JobsStatistics is the resolver for the jobsStatistics field.
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) ([]*model.JobsStatistics, error) {
|
||||
var err error
|
||||
var stats []*model.JobsStatistics
|
||||
|
||||
// Top Level Defaults
|
||||
var defaultDurationBins string = "1h"
|
||||
var defaultMetricBins int = 10
|
||||
|
||||
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
|
||||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
|
||||
if groupBy == nil {
|
||||
@@ -305,8 +396,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histDuration") || requireField(ctx, "histNumNodes") || requireField(ctx, "histNumCores") || requireField(ctx, "histNumAccs") {
|
||||
|
||||
if numDurationBins == nil {
|
||||
numDurationBins = &defaultDurationBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0])
|
||||
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0], numDurationBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -316,8 +412,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
|
||||
if requireField(ctx, "histMetrics") {
|
||||
|
||||
if numMetricBins == nil {
|
||||
numMetricBins = &defaultMetricBins
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0])
|
||||
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0], numMetricBins)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -337,8 +438,8 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
|
||||
// NodeMetrics is the resolver for the nodeMetrics field.
|
||||
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasRole(schema.RoleAdmin) {
|
||||
return nil, errors.New("you need to be an administrator for this query")
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
@@ -347,9 +448,9 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
}
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading node data")
|
||||
log.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -359,7 +460,10 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
Host: hostname,
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, _ = archive.GetSubClusterByNode(cluster, hostname)
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
log.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
@@ -377,6 +481,68 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
return nodeMetrics, nil
|
||||
}
|
||||
|
||||
// NodeMetricsList is the resolver for the nodeMetricsList field.
|
||||
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
|
||||
if resolution == nil { // Load from Config
|
||||
if config.Keys.EnableResampling != nil {
|
||||
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
|
||||
resolution = &defaultRes
|
||||
} else { // Set 0 (Loads configured metric timestep)
|
||||
defaultRes := 0
|
||||
resolution = &defaultRes
|
||||
}
|
||||
}
|
||||
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, totalNodes, hasNextPage, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, *resolution, from, to, page, ctx)
|
||||
if err != nil {
|
||||
log.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
log.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for scope, scopedMetric := range scopedMetrics {
|
||||
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
|
||||
Name: metric,
|
||||
Scope: scope,
|
||||
Metric: scopedMetric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
nodeMetricsList = append(nodeMetricsList, host)
|
||||
}
|
||||
|
||||
nodeMetricsListResult := &model.NodesResultList{
|
||||
Items: nodeMetricsList,
|
||||
TotalNodes: &totalNodes,
|
||||
HasNextPage: &hasNextPage,
|
||||
}
|
||||
|
||||
return nodeMetricsListResult, nil
|
||||
}
|
||||
|
||||
// NumberOfNodes is the resolver for the numberOfNodes field.
|
||||
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
|
||||
nodeList, err := archive.ParseNodeList(obj.Nodes)
|
||||
@@ -392,6 +558,9 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver
|
||||
// Job returns generated.JobResolver implementation.
|
||||
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
|
||||
|
||||
// MetricValue returns generated.MetricValueResolver implementation.
|
||||
func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} }
|
||||
|
||||
// Mutation returns generated.MutationResolver implementation.
|
||||
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
|
||||
|
||||
@@ -403,6 +572,7 @@ func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subCluste
|
||||
|
||||
type clusterResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type metricValueResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
||||
type subClusterResolver struct{ *Resolver }
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
@@ -24,8 +24,8 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
rows int, cols int,
|
||||
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
|
||||
minX float64, minY float64, maxX float64, maxY float64,
|
||||
) ([][]float64, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
log.Error("Error while querying jobs for roofline")
|
||||
@@ -47,7 +47,14 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
continue
|
||||
}
|
||||
|
||||
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
// resolution := 0
|
||||
|
||||
// for _, mc := range metricConfigs {
|
||||
// resolution = max(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
return nil, err
|
||||
@@ -120,7 +127,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
log.Error("Error while loading averages for footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -8,9 +8,9 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
@@ -42,8 +42,8 @@ func HandleImportFlag(flag string) error {
|
||||
}
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err = dec.Decode(&jobMeta); err != nil {
|
||||
job := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err = dec.Decode(&job); err != nil {
|
||||
log.Warn("Error while decoding raw json metadata for import")
|
||||
return err
|
||||
}
|
||||
@@ -67,32 +67,60 @@ func HandleImportFlag(flag string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// checkJobData(&jobData)
|
||||
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
|
||||
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
|
||||
// if err != nil {
|
||||
// log.Warn("Error while finding job in jobRepository")
|
||||
// return err
|
||||
// }
|
||||
//
|
||||
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
|
||||
// }
|
||||
//
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
|
||||
job.Footprint = make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
statType := "avg"
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
|
||||
job.Footprint[name] = repository.LoadJobStat(&job, fp, statType)
|
||||
}
|
||||
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job footprint")
|
||||
return err
|
||||
}
|
||||
|
||||
job.EnergyFootprint = make(map[string]float64)
|
||||
var totalEnergy float64
|
||||
var energy float64
|
||||
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits
|
||||
energy = math.Round(((repository.LoadJobStat(&job, fp, "avg")*float64(job.Duration))/3600/1000)*100) / 100
|
||||
}
|
||||
} else {
|
||||
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
|
||||
}
|
||||
|
||||
job.EnergyFootprint[fp] = energy
|
||||
totalEnergy += energy
|
||||
}
|
||||
|
||||
job.Energy = (math.Round(totalEnergy*100) / 100)
|
||||
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
@@ -110,7 +138,7 @@ func HandleImportFlag(flag string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
|
||||
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
|
||||
log.Error("Error while importing job")
|
||||
return err
|
||||
}
|
||||
@@ -122,8 +150,8 @@ func HandleImportFlag(flag string) error {
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
log.Error("Error while adding or creating tag")
|
||||
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
log.Error("Error while adding or creating tag on import")
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,7 +82,7 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
if err := os.Mkdir(jobarchive, 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")
|
||||
|
||||
@@ -7,6 +7,7 @@ package importer
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -16,6 +17,11 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
|
||||
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
|
||||
)
|
||||
|
||||
// Delete the tables "job", "tag" and "jobtag" from the database and
|
||||
// repopulate them using the jobs found in `archive`.
|
||||
func InitDB() error {
|
||||
@@ -60,13 +66,58 @@ func InitDB() error {
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
|
||||
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
|
||||
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
|
||||
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
job.Footprint = make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
statType := "avg"
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
|
||||
job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
|
||||
}
|
||||
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
log.Warn("Error while marshaling job footprint")
|
||||
return err
|
||||
}
|
||||
|
||||
job.EnergyFootprint = make(map[string]float64)
|
||||
var totalEnergy float64
|
||||
var energy float64
|
||||
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits
|
||||
energy = math.Round(((repository.LoadJobStat(jobMeta, fp, "avg")*float64(jobMeta.Duration))/3600/1000)*100) / 100
|
||||
}
|
||||
} else {
|
||||
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
|
||||
}
|
||||
|
||||
job.EnergyFootprint[fp] = energy
|
||||
totalEnergy += energy
|
||||
}
|
||||
|
||||
job.Energy = (math.Round(totalEnergy*100) / 100)
|
||||
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
@@ -88,7 +139,8 @@ func InitDB() error {
|
||||
continue
|
||||
}
|
||||
|
||||
id, err := r.TransactionAdd(t, job)
|
||||
id, err := r.TransactionAddNamed(t,
|
||||
repository.NamedJobInsert, job)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
errorOccured++
|
||||
@@ -99,7 +151,9 @@ func InitDB() error {
|
||||
tagstr := tag.Name + ":" + tag.Type
|
||||
tagId, ok := tags[tagstr]
|
||||
if !ok {
|
||||
tagId, err = r.TransactionAddTag(t, tag)
|
||||
tagId, err = r.TransactionAdd(t,
|
||||
addTagQuery,
|
||||
tag.Name, tag.Type)
|
||||
if err != nil {
|
||||
log.Errorf("Error adding tag: %v", err)
|
||||
errorOccured++
|
||||
@@ -108,7 +162,9 @@ func InitDB() error {
|
||||
tags[tagstr] = tagId
|
||||
}
|
||||
|
||||
r.TransactionSetTag(t, id, tagId)
|
||||
r.TransactionAdd(t,
|
||||
setTagQuery,
|
||||
id, tagId)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
@@ -150,18 +206,6 @@ func SanityChecks(job *schema.BaseJob) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
if metric == "mem_used" {
|
||||
return stats.Max
|
||||
} else {
|
||||
return stats.Avg
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0
|
||||
}
|
||||
|
||||
func checkJobData(d *schema.JobData) error {
|
||||
for _, scopes := range *d {
|
||||
// var newUnit schema.Unit
|
||||
|
||||
1486
internal/importer/testdata/cluster-fritz.json
vendored
1486
internal/importer/testdata/cluster-fritz.json
vendored
File diff suppressed because it is too large
Load Diff
310
internal/metricDataDispatcher/dataLoader.go
Normal file
310
internal/metricDataDispatcher/dataLoader.go
Normal file
@@ -0,0 +1,310 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricDataDispatcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/resampler"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ interface{}, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
config.Keys.DisableArchive {
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
log.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jd_temp schema.JobData
|
||||
jd_temp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
log.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
//Deep copy the cached archive hashmap
|
||||
jd = metricdata.DeepCopy(jd_temp)
|
||||
|
||||
//Resampling for archived data.
|
||||
//Pass the resolution from frontend here.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := 0
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, v_.Timestep, resolution)
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = timestep
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// FIXME: Review: Is this really necessary or correct.
|
||||
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/median/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
// Existing (archived) StatsSeries can be 'min/mean/max'!
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
log.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
log.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, totalNodes, hasNextPage, err := repo.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, resolution, from, to, page, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
log.Error("Error while loading node data from metric repository")
|
||||
return nil, totalNodes, hasNextPage, err
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, totalNodes, hasNextPage, nil
|
||||
}
|
||||
@@ -11,10 +11,12 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
@@ -55,6 +57,7 @@ type ApiQuery struct {
|
||||
SubType *string `json:"subtype,omitempty"`
|
||||
Metric string `json:"metric"`
|
||||
Hostname string `json:"host"`
|
||||
Resolution int `json:"resolution"`
|
||||
TypeIds []string `json:"type-ids,omitempty"`
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||
Aggregate bool `json:"aggreg"`
|
||||
@@ -66,13 +69,14 @@ type ApiQueryResponse struct {
|
||||
}
|
||||
|
||||
type ApiMetricData struct {
|
||||
Error *string `json:"error"`
|
||||
Data []schema.Float `json:"data"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
Error *string `json:"error"`
|
||||
Data []schema.Float `json:"data"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Resolution int `json:"resolution"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
|
||||
@@ -129,7 +133,7 @@ func (ccms *CCMetricStore) doRequest(
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
|
||||
if err != nil {
|
||||
log.Warn("Error while building request body")
|
||||
return nil, err
|
||||
@@ -138,6 +142,13 @@ func (ccms *CCMetricStore) doRequest(
|
||||
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
||||
}
|
||||
|
||||
// versioning the cc-metric-store query API.
|
||||
// v2 = data with resampling
|
||||
// v1 = data without resampling
|
||||
q := req.URL.Query()
|
||||
q.Add("version", "v2")
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
res, err := ccms.client.Do(req)
|
||||
if err != nil {
|
||||
log.Error("Error while performing request")
|
||||
@@ -162,8 +173,9 @@ func (ccms *CCMetricStore) LoadData(
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
|
||||
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while building queries")
|
||||
return nil, err
|
||||
@@ -195,11 +207,16 @@ func (ccms *CCMetricStore) LoadData(
|
||||
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
}
|
||||
|
||||
res := mc.Timestep
|
||||
if len(row) > 0 {
|
||||
res = row[0].Resolution
|
||||
}
|
||||
|
||||
jobMetric, ok := jobData[metric][scope]
|
||||
if !ok {
|
||||
jobMetric = &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Timestep: mc.Timestep,
|
||||
Timestep: res,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
jobData[metric][scope] = jobMetric
|
||||
@@ -219,8 +236,7 @@ func (ccms *CCMetricStore) LoadData(
|
||||
}
|
||||
|
||||
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
|
||||
// TODO: use schema.Float instead of float64?
|
||||
// This is done because regular float64 can not be JSONed when NaN.
|
||||
// "schema.Float()" because regular float64 can not be JSONed when NaN.
|
||||
res.Avg = schema.Float(0)
|
||||
res.Min = schema.Float(0)
|
||||
res.Max = schema.Float(0)
|
||||
@@ -251,7 +267,6 @@ func (ccms *CCMetricStore) LoadData(
|
||||
/* Returns list for "partial errors" */
|
||||
return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
@@ -267,6 +282,7 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) ([]ApiQuery, []schema.MetricScope, error) {
|
||||
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
||||
assignedScope := []schema.MetricScope{}
|
||||
@@ -318,11 +334,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
||||
continue
|
||||
@@ -335,11 +352,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -348,11 +366,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
// HWThread -> HWThread
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -363,11 +382,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
for _, core := range cores {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Core[core]),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Core[core]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
@@ -379,11 +399,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
for _, socket := range sockets {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
@@ -393,11 +414,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
// HWThread -> Node
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -407,25 +429,44 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Socket
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromCores(hwthreads)
|
||||
for _, socket := range sockets {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Node
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -435,11 +476,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -449,11 +491,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -463,11 +506,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -477,11 +521,12 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -490,8 +535,9 @@ func (ccms *CCMetricStore) buildQueries(
|
||||
// Node -> Node
|
||||
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
@@ -510,7 +556,15 @@ func (ccms *CCMetricStore) LoadStats(
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}) // #166 Add scope shere for analysis view accelerator normalization?
|
||||
|
||||
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
// resolution := 9000
|
||||
|
||||
// for _, mc := range metricConfigs {
|
||||
// resolution = min(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
|
||||
if err != nil {
|
||||
log.Warn("Error while building query")
|
||||
return nil, err
|
||||
@@ -588,8 +642,9 @@ func (ccms *CCMetricStore) LoadNodeData(
|
||||
for _, node := range nodes {
|
||||
for _, metric := range metrics {
|
||||
req.Queries = append(req.Queries, ApiQuery{
|
||||
Hostname: node,
|
||||
Metric: ccms.toRemoteName(metric),
|
||||
Hostname: node,
|
||||
Metric: ccms.toRemoteName(metric),
|
||||
Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -597,7 +652,7 @@ func (ccms *CCMetricStore) LoadNodeData(
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
log.Error("Error while performing request")
|
||||
log.Error(fmt.Sprintf("Error while performing request %#v\n", err))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -655,6 +710,462 @@ func (ccms *CCMetricStore) LoadNodeData(
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
|
||||
// 0) Init additional vars
|
||||
var totalNodes int = 0
|
||||
var hasNextPage bool = false
|
||||
|
||||
// 1) Get list of all nodes
|
||||
var nodes []string
|
||||
if subCluster != "" {
|
||||
scNodes := archive.NodeLists[cluster][subCluster]
|
||||
nodes = scNodes.PrintList()
|
||||
} else {
|
||||
subClusterNodeLists := archive.NodeLists[cluster]
|
||||
for _, nodeList := range subClusterNodeLists {
|
||||
nodes = append(nodes, nodeList.PrintList()...)
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Filter nodes
|
||||
if nodeFilter != "" {
|
||||
filteredNodes := []string{}
|
||||
for _, node := range nodes {
|
||||
if strings.Contains(node, nodeFilter) {
|
||||
filteredNodes = append(filteredNodes, node)
|
||||
}
|
||||
}
|
||||
nodes = filteredNodes
|
||||
}
|
||||
|
||||
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after ccms return ...
|
||||
totalNodes = len(nodes)
|
||||
sort.Strings(nodes)
|
||||
|
||||
// 3) Apply paging
|
||||
if len(nodes) > page.ItemsPerPage {
|
||||
start := (page.Page - 1) * page.ItemsPerPage
|
||||
end := start + page.ItemsPerPage
|
||||
if end > len(nodes) {
|
||||
end = len(nodes)
|
||||
hasNextPage = false
|
||||
} else {
|
||||
hasNextPage = true
|
||||
}
|
||||
nodes = nodes[start:end]
|
||||
}
|
||||
|
||||
// Note: Order of node data is not guaranteed after this point, but contents match page and filter criteria
|
||||
|
||||
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
|
||||
if err != nil {
|
||||
log.Warn("Error while building queries")
|
||||
return nil, totalNodes, hasNextPage, err
|
||||
}
|
||||
|
||||
req := ApiQueryRequest{
|
||||
Cluster: cluster,
|
||||
Queries: queries,
|
||||
From: from.Unix(),
|
||||
To: to.Unix(),
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
log.Error(fmt.Sprintf("Error while performing request %#v\n", err))
|
||||
return nil, totalNodes, hasNextPage, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
data := make(map[string]schema.JobData)
|
||||
for i, row := range resBody.Results {
|
||||
var query ApiQuery
|
||||
if resBody.Queries != nil {
|
||||
query = resBody.Queries[i]
|
||||
} else {
|
||||
query = req.Queries[i]
|
||||
}
|
||||
// qdata := res[0]
|
||||
metric := ccms.toLocalName(query.Metric)
|
||||
scope := assignedScope[i]
|
||||
mc := archive.GetMetricConfig(cluster, metric)
|
||||
|
||||
res := mc.Timestep
|
||||
if len(row) > 0 {
|
||||
res = row[0].Resolution
|
||||
}
|
||||
|
||||
// Init Nested Map Data Structures If Not Found
|
||||
hostData, ok := data[query.Hostname]
|
||||
if !ok {
|
||||
hostData = make(schema.JobData)
|
||||
data[query.Hostname] = hostData
|
||||
}
|
||||
|
||||
metricData, ok := hostData[metric]
|
||||
if !ok {
|
||||
metricData = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
data[query.Hostname][metric] = metricData
|
||||
}
|
||||
|
||||
scopeData, ok := metricData[scope]
|
||||
if !ok {
|
||||
scopeData = &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Timestep: res,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
data[query.Hostname][metric][scope] = scopeData
|
||||
}
|
||||
|
||||
for ndx, res := range row {
|
||||
if res.Error != nil {
|
||||
/* Build list for "partial errors", if any */
|
||||
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
id := (*string)(nil)
|
||||
if query.Type != nil {
|
||||
id = new(string)
|
||||
*id = query.TypeIds[ndx]
|
||||
}
|
||||
|
||||
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
|
||||
// "schema.Float()" because regular float64 can not be JSONed when NaN.
|
||||
res.Avg = schema.Float(0)
|
||||
res.Min = schema.Float(0)
|
||||
res.Max = schema.Float(0)
|
||||
}
|
||||
|
||||
scopeData.Series = append(scopeData.Series, schema.Series{
|
||||
Hostname: query.Hostname,
|
||||
Id: id,
|
||||
Statistics: schema.MetricStatistics{
|
||||
Avg: float64(res.Avg),
|
||||
Min: float64(res.Min),
|
||||
Max: float64(res.Max),
|
||||
},
|
||||
Data: res.Data,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
/* Returns list of "partial errors" */
|
||||
return data, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return data, totalNodes, hasNextPage, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) buildNodeQueries(
|
||||
cluster string,
|
||||
subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) ([]ApiQuery, []schema.MetricScope, error) {
|
||||
|
||||
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes))
|
||||
assignedScope := []schema.MetricScope{}
|
||||
|
||||
// Get Topol before loop if subCluster given
|
||||
var subClusterTopol *schema.SubCluster
|
||||
var scterr error
|
||||
if subCluster != "" {
|
||||
subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster)
|
||||
if scterr != nil {
|
||||
// TODO: Log
|
||||
return nil, nil, scterr
|
||||
}
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
remoteName := ccms.toRemoteName(metric)
|
||||
mc := archive.GetMetricConfig(cluster, metric)
|
||||
if mc == nil {
|
||||
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster)
|
||||
log.Infof("metric '%s' is not specified for cluster '%s'", metric, cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
// Avoid duplicates...
|
||||
handledScopes := make([]schema.MetricScope, 0, 3)
|
||||
|
||||
scopesLoop:
|
||||
for _, requestedScope := range scopes {
|
||||
nativeScope := mc.Scope
|
||||
|
||||
scope := nativeScope.Max(requestedScope)
|
||||
for _, s := range handledScopes {
|
||||
if scope == s {
|
||||
continue scopesLoop
|
||||
}
|
||||
}
|
||||
handledScopes = append(handledScopes, scope)
|
||||
|
||||
for _, hostname := range nodes {
|
||||
|
||||
// If no subCluster given, get it by node
|
||||
if subCluster == "" {
|
||||
subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname)
|
||||
if scnerr != nil {
|
||||
return nil, nil, scnerr
|
||||
}
|
||||
subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName)
|
||||
if scterr != nil {
|
||||
return nil, nil, scterr
|
||||
}
|
||||
}
|
||||
|
||||
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
|
||||
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
|
||||
topology := subClusterTopol.Topology
|
||||
acceleratorIds := topology.GetAcceleratorIDs()
|
||||
|
||||
// Moved check here if metric matches hardware specs
|
||||
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
|
||||
continue scopesLoop
|
||||
}
|
||||
|
||||
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
|
||||
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
|
||||
if scope != schema.MetricScopeAccelerator {
|
||||
// Skip all other catched cases
|
||||
continue
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: false,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: acceleratorIds,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
||||
continue
|
||||
}
|
||||
|
||||
// Accelerator -> Node
|
||||
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
|
||||
if len(acceleratorIds) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: acceleratorIds,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> HWThead
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: false,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Node),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Core
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
|
||||
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
||||
for _, core := range cores {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Core[core]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Socket
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
||||
for _, socket := range sockets {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Node
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Node),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Core
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
||||
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: false,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Socket
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromCores(topology.Node)
|
||||
for _, socket := range sockets {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Node
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
||||
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// MemoryDomain -> MemoryDomain
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: false,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// MemoryDoman -> Node
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Socket -> Socket
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: false,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Socket -> Node
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Aggregate: true,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Node -> Node
|
||||
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: hostname,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return queries, assignedScope, nil
|
||||
}
|
||||
|
||||
func intToStringSlice(is []int) []string {
|
||||
ss := make([]string, len(is))
|
||||
for i, x := range is {
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
@@ -60,7 +61,8 @@ func (idb *InfluxDBv2DataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
ctx context.Context,
|
||||
resolution int) (schema.JobData, error) {
|
||||
|
||||
measurementsConds := make([]string, 0, len(metrics))
|
||||
for _, m := range metrics {
|
||||
@@ -311,3 +313,21 @@ func (idb *InfluxDBv2DataRepository) LoadNodeData(
|
||||
|
||||
return nil, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
|
||||
var totalNodes int = 0
|
||||
var hasNextPage bool = false
|
||||
// TODO : Implement to be used in NodeList-View
|
||||
log.Infof("LoadNodeListData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodeFilter %v, scopes %v", cluster, metrics, nodeFilter, scopes)
|
||||
|
||||
return nil, totalNodes, hasNextPage, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
|
||||
}
|
||||
|
||||
@@ -8,13 +8,11 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
@@ -24,21 +22,21 @@ type MetricDataRepository interface {
|
||||
Init(rawConfig json.RawMessage) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes for that node.
|
||||
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
|
||||
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
|
||||
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error)
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
|
||||
var useArchive bool
|
||||
|
||||
func Init(disableArchive bool) error {
|
||||
useArchive = !disableArchive
|
||||
func Init() error {
|
||||
for _, cluster := range config.Keys.Clusters {
|
||||
if cluster.MetricDataRepository != nil {
|
||||
var kind struct {
|
||||
@@ -73,284 +71,13 @@ func Init(disableArchive bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
!useArchive {
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
log.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
jd, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
log.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
prepareJobData(job, jd, scopes)
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
log.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && useArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
|
||||
var err error
|
||||
repo, ok := metricDataRepos[cluster]
|
||||
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
log.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]",
|
||||
job.ID, job.State, metrics, scopes)
|
||||
}
|
||||
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/mean/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
func prepareJobData(
|
||||
job *schema.Job,
|
||||
jobData schema.JobData,
|
||||
scopes []schema.MetricScope,
|
||||
) {
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jobData {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jobData.AddNodeScope("flops_any")
|
||||
jobData.AddNodeScope("mem_bw")
|
||||
}
|
||||
}
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
// TODO: Talk about this! What resolutions to store data at...
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
if job.NumNodes <= 8 {
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||
if err != nil {
|
||||
log.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if !useArchive {
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
|
||||
return repo, err
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
@@ -166,10 +167,10 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var rt http.RoundTripper = nil
|
||||
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
|
||||
prom_pw := promcfg.Secret(prom_pw)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(config.Username, prom_pw, "", promapi.DefaultRoundTripper)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
|
||||
} else {
|
||||
if config.Username != "" {
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set.")
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
|
||||
}
|
||||
}
|
||||
// init client
|
||||
@@ -204,8 +205,8 @@ func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
metric string,
|
||||
scope schema.MetricScope,
|
||||
nodes []string,
|
||||
cluster string) (string, error) {
|
||||
|
||||
cluster string,
|
||||
) (string, error) {
|
||||
args := PromQLArgs{}
|
||||
if len(nodes) > 0 {
|
||||
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
|
||||
@@ -233,12 +234,13 @@ func (pdb *PrometheusDataRepository) RowToSeries(
|
||||
from time.Time,
|
||||
step int64,
|
||||
steps int64,
|
||||
row *promm.SampleStream) schema.Series {
|
||||
row *promm.SampleStream,
|
||||
) schema.Series {
|
||||
ts := from.Unix()
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
// init array of expected length with NaN
|
||||
values := make([]schema.Float, steps+1)
|
||||
for i, _ := range values {
|
||||
for i := range values {
|
||||
values[i] = schema.NaN
|
||||
}
|
||||
// copy recorded values from prom sample pair
|
||||
@@ -263,8 +265,9 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
// TODO respect requested scope
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
@@ -306,7 +309,6 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
@@ -335,7 +337,7 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
pdb.RowToSeries(from, step, steps, row))
|
||||
}
|
||||
// only add metric if at least one host returned data
|
||||
if !ok && len(jobMetric.Series) > 0{
|
||||
if !ok && len(jobMetric.Series) > 0 {
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
// sort by hostname to get uniform coloring
|
||||
@@ -351,12 +353,12 @@ func (pdb *PrometheusDataRepository) LoadData(
|
||||
func (pdb *PrometheusDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
// map of metrics of nodes of stats
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job for stats")
|
||||
return nil, err
|
||||
@@ -376,7 +378,8 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
t0 := time.Now()
|
||||
// Map of hosts of metrics of value slices
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
@@ -411,7 +414,6 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
@@ -445,3 +447,21 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
log.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func (pdb *PrometheusDataRepository) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
|
||||
var totalNodes int = 0
|
||||
var hasNextPage bool = false
|
||||
// TODO : Implement to be used in NodeList-View
|
||||
log.Infof("LoadNodeListData unimplemented for PrometheusDataRepository, Args: cluster %s, metrics %v, nodeFilter %v, scopes %v", cluster, metrics, nodeFilter, scopes)
|
||||
|
||||
return nil, totalNodes, hasNextPage, errors.New("METRICDATA/INFLUXV2 > unimplemented for PrometheusDataRepository")
|
||||
}
|
||||
|
||||
@@ -9,10 +9,11 @@ import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
@@ -27,9 +28,10 @@ func (tmdr *TestMetricDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.JobData, error) {
|
||||
ctx context.Context,
|
||||
resolution int) (schema.JobData, error) {
|
||||
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx)
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadStats(
|
||||
@@ -48,3 +50,62 @@ func (tmdr *TestMetricDataRepository) LoadNodeData(
|
||||
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeListData(
|
||||
cluster, subCluster, nodeFilter string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
page *model.PageRequest,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, int, bool, error) {
|
||||
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func DeepCopy(jd_temp schema.JobData) schema.JobData {
|
||||
var jd schema.JobData
|
||||
|
||||
jd = make(schema.JobData, len(jd_temp))
|
||||
for k, v := range jd_temp {
|
||||
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jd_temp[k]))
|
||||
for k_, v_ := range v {
|
||||
jd[k][k_] = new(schema.JobMetric)
|
||||
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
|
||||
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
|
||||
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
|
||||
jd[k][k_].Series[i].Id = v_.Series[i].Id
|
||||
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
|
||||
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
|
||||
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
|
||||
}
|
||||
jd[k][k_].Timestep = v_.Timestep
|
||||
jd[k][k_].Unit.Base = v_.Unit.Base
|
||||
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
|
||||
if v_.StatisticsSeries != nil {
|
||||
// Init Slices
|
||||
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
|
||||
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
|
||||
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
|
||||
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
|
||||
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
|
||||
// Copy Data
|
||||
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
|
||||
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
|
||||
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
|
||||
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
|
||||
// Handle Percentiles
|
||||
for k__, v__ := range v_.StatisticsSeries.Percentiles {
|
||||
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
|
||||
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
|
||||
}
|
||||
} else {
|
||||
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
|
||||
}
|
||||
}
|
||||
}
|
||||
return jd
|
||||
}
|
||||
|
||||
@@ -5,17 +5,16 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
@@ -30,12 +29,10 @@ var (
|
||||
)
|
||||
|
||||
type JobRepository struct {
|
||||
DB *sqlx.DB
|
||||
stmtCache *sq.StmtCache
|
||||
cache *lrucache.Cache
|
||||
archiveChannel chan *schema.Job
|
||||
driver string
|
||||
archivePending sync.WaitGroup
|
||||
DB *sqlx.DB
|
||||
stmtCache *sq.StmtCache
|
||||
cache *lrucache.Cache
|
||||
driver string
|
||||
}
|
||||
|
||||
func GetJobRepository() *JobRepository {
|
||||
@@ -46,47 +43,48 @@ func GetJobRepository() *JobRepository {
|
||||
DB: db.DB,
|
||||
driver: db.Driver,
|
||||
|
||||
stmtCache: sq.NewStmtCache(db.DB),
|
||||
cache: lrucache.New(1024 * 1024),
|
||||
archiveChannel: make(chan *schema.Job, 128),
|
||||
stmtCache: sq.NewStmtCache(db.DB),
|
||||
cache: lrucache.New(1024 * 1024),
|
||||
}
|
||||
// start archiving worker
|
||||
go jobRepoInstance.archivingWorker()
|
||||
})
|
||||
return jobRepoInstance
|
||||
}
|
||||
|
||||
var jobColumns []string = []string{
|
||||
"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id",
|
||||
"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id",
|
||||
"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
|
||||
"job.duration", "job.walltime", "job.resources", "job.mem_used_max", "job.flops_any_avg", "job.mem_bw_avg", "job.load_avg", // "job.meta_data",
|
||||
"job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy",
|
||||
}
|
||||
|
||||
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
|
||||
job := &schema.Job{}
|
||||
|
||||
if err := row.Scan(
|
||||
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
|
||||
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
|
||||
&job.Duration, &job.Walltime, &job.RawResources, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.LoadAvg /*&job.RawMetaData*/); err != nil {
|
||||
&job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil {
|
||||
log.Warnf("Error while scanning rows (Job): %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
|
||||
log.Warn("Error while unmarhsaling raw resources json")
|
||||
log.Warn("Error while unmarshaling raw resources json")
|
||||
return nil, err
|
||||
}
|
||||
job.RawResources = nil
|
||||
|
||||
// if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
|
||||
log.Warnf("Error while unmarshaling raw footprint json: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
job.RawFootprint = nil
|
||||
|
||||
job.StartTime = time.Unix(job.StartTimeUnix, 0)
|
||||
if job.Duration == 0 && job.State == schema.JobStateRunning {
|
||||
// Always ensure accurate duration for running jobs
|
||||
if job.State == schema.JobStateRunning {
|
||||
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
||||
}
|
||||
|
||||
job.RawResources = nil
|
||||
return job, nil
|
||||
}
|
||||
|
||||
@@ -205,7 +203,10 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
|
||||
if _, err = sq.Update("job").
|
||||
Set("meta_data", job.RawMetaData).
|
||||
Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID)
|
||||
return err
|
||||
}
|
||||
@@ -214,222 +215,54 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
|
||||
return archive.UpdateMetadata(job, job.MetaData)
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
log.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindAll executes a SQL query to find all batch jobs matching the given criteria.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindAll(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
|
||||
log.Warn("Error while scanning for job footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 10)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
log.Debugf("Timer FindAll %s", time.Since(start))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindConcurrentJobs(
|
||||
ctx context.Context,
|
||||
job *schema.Job,
|
||||
) (*model.JobLinkResultList, error) {
|
||||
if job == nil {
|
||||
if len(job.RawFootprint) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
startTime = job.StartTimeUnix
|
||||
hostname := job.Resources[0].Hostname
|
||||
|
||||
if job.State == schema.JobStateRunning {
|
||||
stopTime = time.Now().Unix()
|
||||
} else {
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
|
||||
log.Warn("Error while unmarshaling raw footprint json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
items := make([]*model.JobLink, 0, 10)
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
log.Debugf("Timer FetchFootprint %s", time.Since(start))
|
||||
return job.Footprint, nil
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) {
|
||||
start := time.Now()
|
||||
cachekey := fmt.Sprintf("energyFootprint:%d", job.ID)
|
||||
if cached := r.cache.Get(cachekey, nil); cached != nil {
|
||||
job.EnergyFootprint = cached.(map[string]float64)
|
||||
return job.EnergyFootprint, nil
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
if err := sq.Select("job.energy_footprint").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawEnergyFootprint); err != nil {
|
||||
log.Warn("Error while scanning for job energy_footprint")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
if len(job.RawEnergyFootprint) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
cnt := len(items)
|
||||
|
||||
return &model.JobLinkResultList{
|
||||
ListQuery: &queryString,
|
||||
Items: items,
|
||||
Count: &cnt,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transferred!
|
||||
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
if err := json.Unmarshal(job.RawEnergyFootprint, &job.EnergyFootprint); err != nil {
|
||||
log.Warn("Error while unmarshaling raw energy footprint json")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
res, err := r.DB.NamedExec(`INSERT INTO job (
|
||||
job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
|
||||
);`, job)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
r.cache.Put(cachekey, job.EnergyFootprint, len(job.EnergyFootprint), 24*time.Hour)
|
||||
log.Debugf("Timer FetchEnergyFootprint %s", time.Since(start))
|
||||
return job.EnergyFootprint, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
|
||||
@@ -461,119 +294,22 @@ func (r *JobRepository) DeleteJobById(id int64) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) MarkArchived(
|
||||
jobId int64,
|
||||
monitoringStatus int32,
|
||||
metricStats map[string]schema.JobStatistics,
|
||||
) error {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
for metric, stats := range metricStats {
|
||||
switch metric {
|
||||
case "flops_any":
|
||||
stmt = stmt.Set("flops_any_avg", stats.Avg)
|
||||
case "mem_used":
|
||||
stmt = stmt.Set("mem_used_max", stats.Max)
|
||||
case "mem_bw":
|
||||
stmt = stmt.Set("mem_bw_avg", stats.Avg)
|
||||
case "load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "cpu_load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "net_bw":
|
||||
stmt = stmt.Set("net_bw_avg", stats.Avg)
|
||||
case "file_bw":
|
||||
stmt = stmt.Set("file_bw_avg", stats.Avg)
|
||||
default:
|
||||
log.Debugf("MarkArchived() Metric '%v' unknown", metric)
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
log.Warn("Error while marking job as archived")
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Archiving worker thread
|
||||
func (r *JobRepository) archivingWorker() {
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-r.archiveChannel:
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
start := time.Now()
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := r.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
||||
// TODO: Maybe use context with cancel/timeout here
|
||||
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update the jobs database entry one last time:
|
||||
if err := r.MarkArchived(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
r.archivePending.Done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger async archiving
|
||||
func (r *JobRepository) TriggerArchiving(job *schema.Job) {
|
||||
r.archivePending.Add(1)
|
||||
r.archiveChannel <- job
|
||||
}
|
||||
|
||||
// Wait for background thread to finish pending archiving operations
|
||||
func (r *JobRepository) WaitForArchiving() {
|
||||
// close channel and wait for worker to process remaining jobs
|
||||
r.archivePending.Wait()
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) {
|
||||
if _, err := strconv.Atoi(searchterm); err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId
|
||||
return searchterm, "", "", ""
|
||||
} else { // Has to have letters and logged-in user for other guesses
|
||||
if user != nil {
|
||||
// Find username in jobs (match)
|
||||
uresult, _ := r.FindColumnValue(user, searchterm, "job", "user", "user", false)
|
||||
// Find username by username in job table (match)
|
||||
uresult, _ := r.FindColumnValue(user, searchterm, "job", "hpc_user", "hpc_user", false)
|
||||
if uresult != "" {
|
||||
return "", uresult, "", ""
|
||||
}
|
||||
// Find username by name (like)
|
||||
nresult, _ := r.FindColumnValue(user, searchterm, "user", "username", "name", true)
|
||||
// Find username by real name in hpc_user table (like)
|
||||
nresult, _ := r.FindColumnValue(user, searchterm, "hpc_user", "username", "name", true)
|
||||
if nresult != "" {
|
||||
return "", nresult, "", ""
|
||||
}
|
||||
// Find projectId in jobs (match)
|
||||
// Find projectId by projectId in job table (match)
|
||||
presult, _ := r.FindColumnValue(user, searchterm, "job", "project", "project", false)
|
||||
if presult != "" {
|
||||
return "", "", presult, ""
|
||||
@@ -655,7 +391,7 @@ func (r *JobRepository) Partitions(cluster string) ([]string, error) {
|
||||
start := time.Now()
|
||||
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
|
||||
parts := []string{}
|
||||
if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
|
||||
if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
|
||||
return nil, 0, 1000
|
||||
}
|
||||
|
||||
@@ -712,6 +448,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
||||
return subclusters, nil
|
||||
}
|
||||
|
||||
// FIXME: Set duration to requested walltime?
|
||||
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
start := time.Now()
|
||||
res, err := sq.Update("job").
|
||||
@@ -740,6 +477,46 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
|
||||
query := sq.Select(jobColumns...).From("job").
|
||||
Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
|
||||
Where("job.job_state = 'running'").
|
||||
Where("job.duration > 600")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
log.Infof("Return job count %d", len(jobs))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateDuration() error {
|
||||
stmnt := sq.Update("job").
|
||||
Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())).
|
||||
Where("job_state = 'running'")
|
||||
|
||||
_, err := stmnt.RunWith(r.stmtCache).Exec()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
|
||||
var query sq.SelectBuilder
|
||||
|
||||
@@ -778,27 +555,112 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
const NamedJobInsert string = `INSERT INTO job (
|
||||
job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data,
|
||||
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data,
|
||||
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
|
||||
);`
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
|
||||
res, err := r.DB.NamedExec(NamedJobInsert, job)
|
||||
if err != nil {
|
||||
log.Warn("Error while NamedJobInsert")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
log.Warn("Error while getting last insert ID")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error {
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) MarkArchived(
|
||||
stmt sq.UpdateBuilder,
|
||||
monitoringStatus int32,
|
||||
) sq.UpdateBuilder {
|
||||
return stmt.Set("monitoring_status", monitoringStatus)
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateEnergy(
|
||||
stmt sq.UpdateBuilder,
|
||||
jobMeta *schema.JobMeta,
|
||||
) (sq.UpdateBuilder, error) {
|
||||
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return stmt, err
|
||||
}
|
||||
energyFootprint := make(map[string]float64)
|
||||
var totalEnergy float64
|
||||
var energy float64
|
||||
|
||||
for _, fp := range sc.EnergyFootprint {
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
|
||||
// Note: For DB data, calculate and save as kWh
|
||||
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules or Wh)
|
||||
// FIXME: Needs sum as stats type
|
||||
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
|
||||
// Energy: Power (in Watts) * Time (in Seconds)
|
||||
// Unit: (( W * s ) / 3600) / 1000 = kWh ; Rounded to 2 nearest digits: (Energy * 100) / 100
|
||||
// Here: All-Node Metric Average * Number of Nodes * Job Runtime
|
||||
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
|
||||
metricNodeSum := LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes) * float64(jobMeta.Duration)
|
||||
energy = math.Round(((metricNodeSum/3600)/1000)*100) / 100
|
||||
}
|
||||
} else {
|
||||
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
|
||||
}
|
||||
|
||||
energyFootprint[fp] = energy
|
||||
totalEnergy += energy
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
if rawFootprint, err = json.Marshal(energyFootprint); err != nil {
|
||||
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return stmt, err
|
||||
}
|
||||
|
||||
return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100) / 100)), nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateFootprint(
|
||||
stmt sq.UpdateBuilder,
|
||||
jobMeta *schema.JobMeta,
|
||||
) (sq.UpdateBuilder, error) {
|
||||
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
|
||||
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
|
||||
if err != nil {
|
||||
log.Errorf("cannot get subcluster: %s", err.Error())
|
||||
return stmt, err
|
||||
}
|
||||
footprint := make(map[string]float64)
|
||||
|
||||
for _, fp := range sc.Footprint {
|
||||
var statType string
|
||||
for _, gm := range archive.GlobalMetricList {
|
||||
if gm.Name == fp {
|
||||
statType = gm.Footprint
|
||||
}
|
||||
}
|
||||
|
||||
if statType != "avg" && statType != "min" && statType != "max" {
|
||||
log.Warnf("unknown statType for footprint update: %s", statType)
|
||||
return stmt, fmt.Errorf("unknown statType for footprint update: %s", statType)
|
||||
}
|
||||
|
||||
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
footprint[name] = LoadJobStat(jobMeta, fp, statType)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
if rawFootprint, err = json.Marshal(footprint); err != nil {
|
||||
log.Warnf("Error while marshaling footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
|
||||
return stmt, err
|
||||
}
|
||||
|
||||
return stmt.Set("footprint", string(rawFootprint)), nil
|
||||
}
|
||||
|
||||
75
internal/repository/jobCreate.go
Normal file
75
internal/repository/jobCreate.go
Normal file
@@ -0,0 +1,75 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// NamedJobInsert is the INSERT statement for one row of the job table. It
// uses named parameters (":name"); InsertJob passes it to NamedExec together
// with the job to be stored. Column list and VALUES list must be kept in the
// same order.
const NamedJobInsert string = `INSERT INTO job (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
) VALUES (
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) {
|
||||
res, err := r.DB.NamedExec(NamedJobInsert, job)
|
||||
if err != nil {
|
||||
log.Warn("Error while NamedJobInsert")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
log.Warn("Error while getting last insert ID")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
return r.InsertJob(job)
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
261
internal/repository/jobFind.go
Normal file
261
internal/repository/jobFind.go
Normal file
@@ -0,0 +1,261 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
|
||||
|
||||
log.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindAll(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 10)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
log.Debugf("Timer FindAll %s", time.Since(start))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdWithUser executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id. The user is passed directly,
|
||||
// instead as part of the context.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
|
||||
q, qerr := SecurityCheckWithUser(user, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdDirect executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByJobId executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id and the clustername.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// IsJobOwner executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id,a username and the cluster.
|
||||
// It returns a bool.
|
||||
// If job was found, user is owner: test err != sql.ErrNoRows
|
||||
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
|
||||
q := sq.Select("id").
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.hpc_user = ?", user).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
return err != sql.ErrNoRows
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindConcurrentJobs(
|
||||
ctx context.Context,
|
||||
job *schema.Job,
|
||||
) (*model.JobLinkResultList, error) {
|
||||
if job == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
startTime = job.StartTimeUnix
|
||||
hostname := job.Resources[0].Hostname
|
||||
|
||||
if job.State == schema.JobStateRunning {
|
||||
stopTime = time.Now().Unix()
|
||||
} else {
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
items := make([]*model.JobLink, 0, 10)
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
cnt := len(items)
|
||||
|
||||
return &model.JobLinkResultList{
|
||||
ListQuery: &queryString,
|
||||
Items: items,
|
||||
Count: &cnt,
|
||||
}, nil
|
||||
}
|
||||
@@ -22,8 +22,8 @@ func (r *JobRepository) QueryJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter,
|
||||
page *model.PageRequest,
|
||||
order *model.OrderByInput) ([]*schema.Job, error) {
|
||||
|
||||
order *model.OrderByInput,
|
||||
) ([]*schema.Job, error) {
|
||||
query, qerr := SecurityCheck(ctx, sq.Select(jobColumns...).From("job"))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
@@ -31,14 +31,28 @@ func (r *JobRepository) QueryJobs(
|
||||
|
||||
if order != nil {
|
||||
field := toSnakeCase(order.Field)
|
||||
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order")
|
||||
if order.Type == "col" {
|
||||
// "col": Fixed column name query
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
|
||||
}
|
||||
} else {
|
||||
// "foot": Order by footprint JSON field values
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(meta_data)")
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,9 +87,10 @@ func (r *JobRepository) QueryJobs(
|
||||
|
||||
func (r *JobRepository) CountJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter) (int, error) {
|
||||
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("count(*)").From("job"))
|
||||
filters []*model.JobFilter,
|
||||
) (int, error) {
|
||||
// DISTINCT count for tag filters; does not affect other queries
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
|
||||
if qerr != nil {
|
||||
return 0, qerr
|
||||
}
|
||||
@@ -92,35 +107,43 @@ func (r *JobRepository) CountJobs(
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
if user == nil {
|
||||
var qnil sq.SelectBuilder
|
||||
return qnil, fmt.Errorf("user context is nil")
|
||||
} else if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleApi}) { // Admin & Co. : All jobs
|
||||
}
|
||||
|
||||
switch {
|
||||
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
|
||||
return query, nil
|
||||
} else if user.HasRole(schema.RoleManager) { // Manager : Add filter for managed projects' jobs only + personal jobs
|
||||
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
|
||||
return query, nil
|
||||
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
|
||||
if len(user.Projects) != 0 {
|
||||
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.user": user.Username}}), nil
|
||||
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
|
||||
} else {
|
||||
log.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
}
|
||||
} else if user.HasRole(schema.RoleUser) { // User : Only personal jobs
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
} else {
|
||||
// Shortterm compatibility: Return User-Query if no roles:
|
||||
return query.Where("job.user = ?", user.Username), nil
|
||||
// // On the longterm: Return Error instead of fallback:
|
||||
// var qnil sq.SelectBuilder
|
||||
// return qnil, fmt.Errorf("user '%s' with unknown roles [%#v]", user.Username, user.Roles)
|
||||
case user.HasRole(schema.RoleUser): // User : Only personal jobs
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
default: // No known Role, return error
|
||||
var qnil sq.SelectBuilder
|
||||
return qnil, fmt.Errorf("user has no or unknown roles")
|
||||
}
|
||||
}
|
||||
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
|
||||
return SecurityCheckWithUser(user, query)
|
||||
}
|
||||
|
||||
// Build a sq.SelectBuilder out of a schema.JobFilter.
|
||||
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if filter.Tags != nil {
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
|
||||
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
|
||||
}
|
||||
if filter.JobID != nil {
|
||||
query = buildStringCondition("job.job_id", filter.JobID, query)
|
||||
@@ -129,7 +152,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
|
||||
}
|
||||
if filter.User != nil {
|
||||
query = buildStringCondition("job.user", filter.User, query)
|
||||
query = buildStringCondition("job.hpc_user", filter.User, query)
|
||||
}
|
||||
if filter.Project != nil {
|
||||
query = buildStringCondition("job.project", filter.Project, query)
|
||||
@@ -141,19 +164,21 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
query = buildStringCondition("job.cluster", filter.Cluster, query)
|
||||
}
|
||||
if filter.Partition != nil {
|
||||
query = buildStringCondition("job.partition", filter.Partition, query)
|
||||
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
|
||||
}
|
||||
if filter.StartTime != nil {
|
||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||
}
|
||||
if filter.Duration != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
|
||||
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||
}
|
||||
if filter.MinRunningFor != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
|
||||
}
|
||||
if filter.Exclusive != nil {
|
||||
query = query.Where("job.exclusive = ?", *filter.Exclusive)
|
||||
}
|
||||
if filter.State != nil {
|
||||
states := make([]string, len(filter.State))
|
||||
for i, val := range filter.State {
|
||||
@@ -174,17 +199,13 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
if filter.Node != nil {
|
||||
query = buildStringCondition("job.resources", filter.Node, query)
|
||||
}
|
||||
if filter.FlopsAnyAvg != nil {
|
||||
query = buildFloatCondition("job.flops_any_avg", filter.FlopsAnyAvg, query)
|
||||
if filter.Energy != nil {
|
||||
query = buildFloatCondition("job.energy", filter.Energy, query)
|
||||
}
|
||||
if filter.MemBwAvg != nil {
|
||||
query = buildFloatCondition("job.mem_bw_avg", filter.MemBwAvg, query)
|
||||
}
|
||||
if filter.LoadAvg != nil {
|
||||
query = buildFloatCondition("job.load_avg", filter.LoadAvg, query)
|
||||
}
|
||||
if filter.MemUsedMax != nil {
|
||||
query = buildFloatCondition("job.mem_used_max", filter.MemUsedMax, query)
|
||||
if filter.MetricStats != nil {
|
||||
for _, ms := range filter.MetricStats {
|
||||
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
|
||||
}
|
||||
}
|
||||
return query
|
||||
}
|
||||
@@ -193,6 +214,10 @@ func buildIntCondition(field string, cond *schema.IntRange, query sq.SelectBuild
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.From != nil && cond.To != nil {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
|
||||
@@ -200,13 +225,32 @@ func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBui
|
||||
return query.Where("? <= "+field, cond.From.Unix())
|
||||
} else if cond.To != nil {
|
||||
return query.Where(field+" <= ?", cond.To.Unix())
|
||||
} else if cond.Range != "" {
|
||||
now := time.Now().Unix()
|
||||
var then int64
|
||||
switch cond.Range {
|
||||
case "last6h":
|
||||
then = now - (60 * 60 * 6)
|
||||
case "last24h":
|
||||
then = now - (60 * 60 * 24)
|
||||
case "last7d":
|
||||
then = now - (60 * 60 * 24 * 7)
|
||||
case "last30d":
|
||||
then = now - (60 * 60 * 24 * 30)
|
||||
default:
|
||||
log.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
|
||||
return query
|
||||
}
|
||||
return query.Where(field+" BETWEEN ? AND ?", then, now)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(footprint)")
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
|
||||
}
|
||||
|
||||
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
@@ -227,9 +271,7 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
|
||||
}
|
||||
if cond.In != nil {
|
||||
queryElements := make([]string, len(cond.In))
|
||||
for i, val := range cond.In {
|
||||
queryElements[i] = val
|
||||
}
|
||||
copy(queryElements, cond.In)
|
||||
return query.Where(sq.Or{sq.Eq{field: queryElements}})
|
||||
}
|
||||
return query
|
||||
@@ -257,8 +299,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.
|
||||
return query
|
||||
}
|
||||
|
||||
var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
|
||||
var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
|
||||
// Patterns used to locate word boundaries in camel-case identifiers.
var (
	// matchFirstCap matches any character followed by an upper-case letter
	// and one or more lower-case letters.
	matchFirstCap = regexp.MustCompile(`(.)([A-Z][a-z]+)`)
	// matchAllCap matches a lower-case letter or digit directly followed by
	// an upper-case letter.
	matchAllCap = regexp.MustCompile(`([a-z0-9])([A-Z])`)
)
|
||||
|
||||
func toSnakeCase(str string) string {
|
||||
for _, c := range str {
|
||||
@@ -5,9 +5,11 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
@@ -30,7 +32,7 @@ func TestFind(t *testing.T) {
|
||||
func TestFindById(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
job, err := r.FindById(5)
|
||||
job, err := r.FindById(getContext(t), 5)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -45,7 +47,19 @@ func TestFindById(t *testing.T) {
|
||||
func TestGetTags(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
tags, counts, err := r.CountTags(nil)
|
||||
const contextUserKey ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
ctx := context.WithValue(getContext(t), contextUserKey, contextUserValue)
|
||||
|
||||
// Test Tag has Scope "global"
|
||||
tags, counts, err := r.CountTags(GetUserFromContext(ctx))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ import (
|
||||
"github.com/golang-migrate/migrate/v4/source/iofs"
|
||||
)
|
||||
|
||||
const Version uint = 7
|
||||
const Version uint = 8
|
||||
|
||||
//go:embed migrations/*
|
||||
var migrationFiles embed.FS
|
||||
@@ -114,6 +114,14 @@ func MigrateDB(backend string, db string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
v, dirty, err := m.Version()
|
||||
|
||||
log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version)
|
||||
|
||||
if dirty {
|
||||
return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version)
|
||||
}
|
||||
|
||||
if err := m.Up(); err != nil {
|
||||
if err == migrate.ErrNoChange {
|
||||
log.Info("DB already up to date!")
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
-- Down migration for 08_add-footprint (MySQL/MariaDB).
-- Reverts the indices, the energy/footprint columns, the tag scope column and
-- the hpc_user/cluster_partition renames introduced by the up migration.
-- NOTE(review): the legacy job_* indices removed by the up migration and the
-- column types changed by its MODIFY COLUMN statements are not restored here;
-- their previous definitions are not part of this migration pair.

-- Drop the indices first, so that no index refers to a column that is removed
-- or renamed below. DROP INDEX needs the owning table on MySQL/MariaDB.
DROP INDEX IF EXISTS jobs_cluster ON job;
DROP INDEX IF EXISTS jobs_cluster_user ON job;
DROP INDEX IF EXISTS jobs_cluster_project ON job;
DROP INDEX IF EXISTS jobs_cluster_subcluster ON job;
DROP INDEX IF EXISTS jobs_cluster_starttime ON job;
DROP INDEX IF EXISTS jobs_cluster_duration ON job;
DROP INDEX IF EXISTS jobs_cluster_numnodes ON job;

DROP INDEX IF EXISTS jobs_cluster_partition ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_duration ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes ON job;

DROP INDEX IF EXISTS jobs_cluster_partition_jobstate ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration ON job;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes ON job;

DROP INDEX IF EXISTS jobs_cluster_jobstate ON job;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user ON job;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project ON job;

DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime ON job;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration ON job;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes ON job;

DROP INDEX IF EXISTS jobs_user ON job;
DROP INDEX IF EXISTS jobs_user_starttime ON job;
DROP INDEX IF EXISTS jobs_user_duration ON job;
DROP INDEX IF EXISTS jobs_user_numnodes ON job;

DROP INDEX IF EXISTS jobs_project ON job;
DROP INDEX IF EXISTS jobs_project_user ON job;
DROP INDEX IF EXISTS jobs_project_starttime ON job;
DROP INDEX IF EXISTS jobs_project_duration ON job;
DROP INDEX IF EXISTS jobs_project_numnodes ON job;

DROP INDEX IF EXISTS jobs_jobstate ON job;
DROP INDEX IF EXISTS jobs_jobstate_user ON job;
DROP INDEX IF EXISTS jobs_jobstate_project ON job;
DROP INDEX IF EXISTS jobs_jobstate_cluster ON job;
DROP INDEX IF EXISTS jobs_jobstate_starttime ON job;
DROP INDEX IF EXISTS jobs_jobstate_duration ON job;
DROP INDEX IF EXISTS jobs_jobstate_numnodes ON job;

DROP INDEX IF EXISTS jobs_arrayjobid_starttime ON job;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime ON job;

DROP INDEX IF EXISTS jobs_starttime ON job;
DROP INDEX IF EXISTS jobs_duration ON job;
DROP INDEX IF EXISTS jobs_numnodes ON job;

DROP INDEX IF EXISTS jobs_duration_starttime ON job;
DROP INDEX IF EXISTS jobs_numnodes_starttime ON job;
DROP INDEX IF EXISTS jobs_numacc_starttime ON job;
DROP INDEX IF EXISTS jobs_energy_starttime ON job;

-- Remove the columns added by the up migration.
ALTER TABLE job DROP COLUMN energy;
ALTER TABLE job DROP COLUMN energy_footprint;
ALTER TABLE tag DROP COLUMN tag_scope;

-- Re-create the per-metric columns. A data type is mandatory here.
-- NOTE(review): REAL NOT NULL DEFAULT 0.0 is assumed to match the pre-08
-- schema; confirm against the initial schema migration.
ALTER TABLE job ADD COLUMN flops_any_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN mem_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN mem_used_max REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN load_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN net_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN net_data_vol_total REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN file_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN file_data_vol_total REAL NOT NULL DEFAULT 0.0;

-- Copy the values back out of the footprint JSON. Keys that are absent
-- (the up migration skips zero-valued optional metrics) fall back to 0.0.
UPDATE job SET flops_any_avg = COALESCE(json_extract(footprint, '$.flops_any_avg'), 0.0);
UPDATE job SET mem_bw_avg = COALESCE(json_extract(footprint, '$.mem_bw_avg'), 0.0);
UPDATE job SET mem_used_max = COALESCE(json_extract(footprint, '$.mem_used_max'), 0.0);
UPDATE job SET load_avg = COALESCE(json_extract(footprint, '$.cpu_load_avg'), 0.0);
UPDATE job SET net_bw_avg = COALESCE(json_extract(footprint, '$.net_bw_avg'), 0.0);
UPDATE job SET net_data_vol_total = COALESCE(json_extract(footprint, '$.net_data_vol_total'), 0.0);
UPDATE job SET file_bw_avg = COALESCE(json_extract(footprint, '$.file_bw_avg'), 0.0);
UPDATE job SET file_data_vol_total = COALESCE(json_extract(footprint, '$.file_data_vol_total'), 0.0);

ALTER TABLE job DROP COLUMN footprint;

-- Restore the original table and column names (reserved keywords, hence quoted).
RENAME TABLE hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
|
||||
123
internal/repository/migrations/mysql/08_add-footprint.up.sql
Normal file
123
internal/repository/migrations/mysql/08_add-footprint.up.sql
Normal file
@@ -0,0 +1,123 @@
|
||||
DROP INDEX IF EXISTS job_stats ON job;
|
||||
DROP INDEX IF EXISTS job_by_user ON job;
|
||||
DROP INDEX IF EXISTS job_by_starttime ON job;
|
||||
DROP INDEX IF EXISTS job_by_job_id ON job;
|
||||
DROP INDEX IF EXISTS job_list ON job;
|
||||
DROP INDEX IF EXISTS job_list_user ON job;
|
||||
DROP INDEX IF EXISTS job_list_users ON job;
|
||||
DROP INDEX IF EXISTS job_list_users_start ON job;
|
||||
|
||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||
ALTER TABLE job ADD COLUMN energy_footprint JSON;
|
||||
|
||||
ALTER TABLE job ADD COLUMN footprint JSON;
|
||||
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
|
||||
|
||||
-- Do not use reserved keywords anymore
|
||||
RENAME TABLE `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
|
||||
|
||||
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
|
||||
|
||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||
|
||||
ALTER TABLE job DROP flops_any_avg;
|
||||
ALTER TABLE job DROP mem_bw_avg;
|
||||
ALTER TABLE job DROP mem_used_max;
|
||||
ALTER TABLE job DROP load_avg;
|
||||
ALTER TABLE job DROP net_bw_avg;
|
||||
ALTER TABLE job DROP net_data_vol_total;
|
||||
ALTER TABLE job DROP file_bw_avg;
|
||||
ALTER TABLE job DROP file_data_vol_total;
|
||||
|
||||
-- Indices for: Single filters, combined filters, sorting, sorting with filters
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Optimize DB index usage
|
||||
103
internal/repository/migrations/sqlite3/08_add-footprint.down.sql
Normal file
103
internal/repository/migrations/sqlite3/08_add-footprint.down.sql
Normal file
@@ -0,0 +1,103 @@
|
||||
-- Down migration for 08_add-footprint (SQLite).
-- Reverts the indices, the energy/footprint columns, the tag scope column and
-- the hpc_user/cluster_partition renames introduced by the up migration.
-- NOTE(review): the legacy job_* indices removed by the up migration are not
-- restored here; their definitions are not part of this migration pair.

-- Drop the indices first: SQLite refuses ALTER TABLE ... DROP COLUMN while the
-- column is still referenced by an index (e.g. the *_energy indices).
DROP INDEX IF EXISTS jobs_cluster;
DROP INDEX IF EXISTS jobs_cluster_user;
DROP INDEX IF EXISTS jobs_cluster_project;
DROP INDEX IF EXISTS jobs_cluster_subcluster;
DROP INDEX IF EXISTS jobs_cluster_starttime;
DROP INDEX IF EXISTS jobs_cluster_duration;
DROP INDEX IF EXISTS jobs_cluster_numnodes;
DROP INDEX IF EXISTS jobs_cluster_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_numacc;
DROP INDEX IF EXISTS jobs_cluster_energy;

DROP INDEX IF EXISTS jobs_cluster_partition;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_partition_numacc;
DROP INDEX IF EXISTS jobs_cluster_partition_energy;

DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numacc;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_energy;

DROP INDEX IF EXISTS jobs_cluster_jobstate;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;

DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numacc;
DROP INDEX IF EXISTS jobs_cluster_jobstate_energy;

DROP INDEX IF EXISTS jobs_user;
DROP INDEX IF EXISTS jobs_user_starttime;
DROP INDEX IF EXISTS jobs_user_duration;
DROP INDEX IF EXISTS jobs_user_numnodes;
DROP INDEX IF EXISTS jobs_user_numhwthreads;
DROP INDEX IF EXISTS jobs_user_numacc;
DROP INDEX IF EXISTS jobs_user_energy;

DROP INDEX IF EXISTS jobs_project;
DROP INDEX IF EXISTS jobs_project_user;
DROP INDEX IF EXISTS jobs_project_starttime;
DROP INDEX IF EXISTS jobs_project_duration;
DROP INDEX IF EXISTS jobs_project_numnodes;
DROP INDEX IF EXISTS jobs_project_numhwthreads;
DROP INDEX IF EXISTS jobs_project_numacc;
DROP INDEX IF EXISTS jobs_project_energy;

DROP INDEX IF EXISTS jobs_jobstate;
DROP INDEX IF EXISTS jobs_jobstate_user;
DROP INDEX IF EXISTS jobs_jobstate_project;
DROP INDEX IF EXISTS jobs_jobstate_cluster;
DROP INDEX IF EXISTS jobs_jobstate_starttime;
DROP INDEX IF EXISTS jobs_jobstate_duration;
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_jobstate_numacc;
DROP INDEX IF EXISTS jobs_jobstate_energy;

DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;

DROP INDEX IF EXISTS jobs_starttime;
DROP INDEX IF EXISTS jobs_duration;
DROP INDEX IF EXISTS jobs_numnodes;
DROP INDEX IF EXISTS jobs_numhwthreads;
DROP INDEX IF EXISTS jobs_numacc;
DROP INDEX IF EXISTS jobs_energy;

DROP INDEX IF EXISTS jobs_duration_starttime;
DROP INDEX IF EXISTS jobs_numnodes_starttime;
DROP INDEX IF EXISTS jobs_numhwthreads_starttime;
DROP INDEX IF EXISTS jobs_numacc_starttime;
DROP INDEX IF EXISTS jobs_energy_starttime;

-- Remove the columns added by the up migration.
ALTER TABLE job DROP COLUMN energy;
ALTER TABLE job DROP COLUMN energy_footprint;
ALTER TABLE tag DROP COLUMN tag_scope;

-- Re-create the per-metric columns.
-- NOTE(review): REAL NOT NULL DEFAULT 0.0 is assumed to match the pre-08
-- schema; confirm against the initial schema migration.
ALTER TABLE job ADD COLUMN flops_any_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN mem_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN mem_used_max REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN load_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN net_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN net_data_vol_total REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN file_bw_avg REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN file_data_vol_total REAL NOT NULL DEFAULT 0.0;

-- Copy the values back out of the footprint JSON. Keys that are absent
-- (the up migration skips zero-valued optional metrics) fall back to 0.0.
UPDATE job SET flops_any_avg = COALESCE(json_extract(footprint, '$.flops_any_avg'), 0.0);
UPDATE job SET mem_bw_avg = COALESCE(json_extract(footprint, '$.mem_bw_avg'), 0.0);
UPDATE job SET mem_used_max = COALESCE(json_extract(footprint, '$.mem_used_max'), 0.0);
UPDATE job SET load_avg = COALESCE(json_extract(footprint, '$.cpu_load_avg'), 0.0);
UPDATE job SET net_bw_avg = COALESCE(json_extract(footprint, '$.net_bw_avg'), 0.0);
UPDATE job SET net_data_vol_total = COALESCE(json_extract(footprint, '$.net_data_vol_total'), 0.0);
UPDATE job SET file_bw_avg = COALESCE(json_extract(footprint, '$.file_bw_avg'), 0.0);
UPDATE job SET file_data_vol_total = COALESCE(json_extract(footprint, '$.file_data_vol_total'), 0.0);

ALTER TABLE job DROP COLUMN footprint;

-- Restore the original table and column names (reserved keywords, hence quoted).
ALTER TABLE hpc_user RENAME TO "user";
ALTER TABLE job RENAME COLUMN hpc_user TO "user";
ALTER TABLE job RENAME COLUMN cluster_partition TO "partition";
|
||||
142
internal/repository/migrations/sqlite3/08_add-footprint.up.sql
Normal file
142
internal/repository/migrations/sqlite3/08_add-footprint.up.sql
Normal file
@@ -0,0 +1,142 @@
|
||||
DROP INDEX IF EXISTS job_stats;
|
||||
DROP INDEX IF EXISTS job_by_user;
|
||||
DROP INDEX IF EXISTS job_by_starttime;
|
||||
DROP INDEX IF EXISTS job_by_job_id;
|
||||
DROP INDEX IF EXISTS job_list;
|
||||
DROP INDEX IF EXISTS job_list_user;
|
||||
DROP INDEX IF EXISTS job_list_users;
|
||||
DROP INDEX IF EXISTS job_list_users_start;
|
||||
|
||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||
ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL;
|
||||
|
||||
ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL;
|
||||
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
|
||||
|
||||
-- Do not use reserved keywords anymore
|
||||
ALTER TABLE "user" RENAME TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN "user" TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN "partition" TO cluster_partition;
|
||||
|
||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||
|
||||
ALTER TABLE job DROP flops_any_avg;
|
||||
ALTER TABLE job DROP mem_bw_avg;
|
||||
ALTER TABLE job DROP mem_used_max;
|
||||
ALTER TABLE job DROP load_avg;
|
||||
ALTER TABLE job DROP net_bw_avg;
|
||||
ALTER TABLE job DROP net_data_vol_total;
|
||||
ALTER TABLE job DROP file_bw_avg;
|
||||
ALTER TABLE job DROP file_data_vol_total;
|
||||
|
||||
-- Indices for: Single filters, combined filters, sorting, sorting with filters
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Optimize DB index usage
|
||||
PRAGMA optimize;
|
||||
@@ -55,7 +55,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
|
||||
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, err := db.FindById(jobId)
|
||||
_, err := db.FindById(getContext(b), jobId)
|
||||
noErr(b, err)
|
||||
}
|
||||
})
|
||||
@@ -111,7 +111,7 @@ func BenchmarkDB_QueryJobs(b *testing.B) {
|
||||
user := "mppi133h"
|
||||
filter.User = &model.StringInput{Eq: &user}
|
||||
page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
|
||||
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
|
||||
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
|
||||
|
||||
b.Run("QueryJobs", func(b *testing.B) {
|
||||
db := setup(b)
|
||||
|
||||
@@ -8,12 +8,11 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
@@ -22,7 +21,7 @@ import (
|
||||
|
||||
// GraphQL validation should make sure that no unknown values can be specified.
|
||||
var groupBy2column = map[model.Aggregate]string{
|
||||
model.AggregateUser: "job.user",
|
||||
model.AggregateUser: "job.hpc_user",
|
||||
model.AggregateProject: "job.project",
|
||||
model.AggregateCluster: "job.cluster",
|
||||
}
|
||||
@@ -41,8 +40,8 @@ var sortBy2column = map[model.SortByAggregate]string{
|
||||
func (r *JobRepository) buildCountQuery(
|
||||
filter []*model.JobFilter,
|
||||
kind string,
|
||||
col string) sq.SelectBuilder {
|
||||
|
||||
col string,
|
||||
) sq.SelectBuilder {
|
||||
var query sq.SelectBuilder
|
||||
|
||||
if col != "" {
|
||||
@@ -69,16 +68,16 @@ func (r *JobRepository) buildCountQuery(
|
||||
|
||||
func (r *JobRepository) buildStatsQuery(
|
||||
filter []*model.JobFilter,
|
||||
col string) sq.SelectBuilder {
|
||||
|
||||
col string,
|
||||
) sq.SelectBuilder {
|
||||
var query sq.SelectBuilder
|
||||
castType := r.getCastType()
|
||||
|
||||
// fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
|
||||
|
||||
if col != "" {
|
||||
// Scan columns: id, totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
|
||||
query = sq.Select(col, "COUNT(job.id) as totalJobs",
|
||||
// Scan columns: id, totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
|
||||
query = sq.Select(col, "COUNT(job.id) as totalJobs", "name",
|
||||
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s) as totalWalltime`, time.Now().Unix(), castType),
|
||||
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s) as totalNodes`, castType),
|
||||
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s) as totalNodeHours`, time.Now().Unix(), castType),
|
||||
@@ -86,10 +85,9 @@ func (r *JobRepository) buildStatsQuery(
|
||||
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s) as totalCoreHours`, time.Now().Unix(), castType),
|
||||
fmt.Sprintf(`CAST(SUM(job.num_acc) as %s) as totalAccs`, castType),
|
||||
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s) as totalAccHours`, time.Now().Unix(), castType),
|
||||
).From("job").GroupBy(col)
|
||||
|
||||
).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col)
|
||||
} else {
|
||||
// Scan columns: totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
|
||||
// Scan columns: totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
|
||||
query = sq.Select("COUNT(job.id)",
|
||||
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s)`, time.Now().Unix(), castType),
|
||||
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s)`, castType),
|
||||
@@ -108,15 +106,15 @@ func (r *JobRepository) buildStatsQuery(
|
||||
return query
|
||||
}
|
||||
|
||||
func (r *JobRepository) getUserName(ctx context.Context, id string) string {
|
||||
user := GetUserFromContext(ctx)
|
||||
name, _ := r.FindColumnValue(user, id, "user", "name", "username", false)
|
||||
if name != "" {
|
||||
return name
|
||||
} else {
|
||||
return "-"
|
||||
}
|
||||
}
|
||||
// func (r *JobRepository) getUserName(ctx context.Context, id string) string {
|
||||
// user := GetUserFromContext(ctx)
|
||||
// name, _ := r.FindColumnValue(user, id, "hpc_user", "name", "username", false)
|
||||
// if name != "" {
|
||||
// return name
|
||||
// } else {
|
||||
// return "-"
|
||||
// }
|
||||
// }
|
||||
|
||||
func (r *JobRepository) getCastType() string {
|
||||
var castType string
|
||||
@@ -138,8 +136,8 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
filter []*model.JobFilter,
|
||||
page *model.PageRequest,
|
||||
sortBy *model.SortByAggregate,
|
||||
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
|
||||
groupBy *model.Aggregate,
|
||||
) ([]*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
col := groupBy2column[*groupBy]
|
||||
query := r.buildStatsQuery(filter, col)
|
||||
@@ -168,14 +166,20 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
|
||||
for rows.Next() {
|
||||
var id sql.NullString
|
||||
var name sql.NullString
|
||||
var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
|
||||
if err := rows.Scan(&id, &jobs, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
|
||||
if err := rows.Scan(&id, &jobs, &name, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
var totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int
|
||||
var personName string
|
||||
|
||||
if name.Valid {
|
||||
personName = name.String
|
||||
}
|
||||
|
||||
if jobs.Valid {
|
||||
totalJobs = int(jobs.Int64)
|
||||
@@ -205,12 +209,12 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
totalAccHours = int(accHours.Int64)
|
||||
}
|
||||
|
||||
if col == "job.user" {
|
||||
name := r.getUserName(ctx, id.String)
|
||||
if col == "job.hpc_user" {
|
||||
// name := r.getUserName(ctx, id.String)
|
||||
stats = append(stats,
|
||||
&model.JobsStatistics{
|
||||
ID: id.String,
|
||||
Name: name,
|
||||
Name: personName,
|
||||
TotalJobs: totalJobs,
|
||||
TotalWalltime: totalWalltime,
|
||||
TotalNodes: totalNodes,
|
||||
@@ -218,7 +222,8 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
TotalCores: totalCores,
|
||||
TotalCoreHours: totalCoreHours,
|
||||
TotalAccs: totalAccs,
|
||||
TotalAccHours: totalAccHours})
|
||||
TotalAccHours: totalAccHours,
|
||||
})
|
||||
} else {
|
||||
stats = append(stats,
|
||||
&model.JobsStatistics{
|
||||
@@ -230,7 +235,8 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
TotalCores: totalCores,
|
||||
TotalCoreHours: totalCoreHours,
|
||||
TotalAccs: totalAccs,
|
||||
TotalAccHours: totalAccHours})
|
||||
TotalAccHours: totalAccHours,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -241,8 +247,8 @@ func (r *JobRepository) JobsStatsGrouped(
|
||||
|
||||
func (r *JobRepository) JobsStats(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter) ([]*model.JobsStatistics, error) {
|
||||
|
||||
filter []*model.JobFilter,
|
||||
) ([]*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
query := r.buildStatsQuery(filter, "")
|
||||
query, err := SecurityCheck(ctx, query)
|
||||
@@ -277,18 +283,36 @@ func (r *JobRepository) JobsStats(
|
||||
TotalWalltime: int(walltime.Int64),
|
||||
TotalNodeHours: totalNodeHours,
|
||||
TotalCoreHours: totalCoreHours,
|
||||
TotalAccHours: totalAccHours})
|
||||
TotalAccHours: totalAccHours,
|
||||
})
|
||||
}
|
||||
|
||||
log.Debugf("Timer JobStats %s", time.Since(start))
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
switch statType {
|
||||
case "avg":
|
||||
return stats.Avg
|
||||
case "max":
|
||||
return stats.Max
|
||||
case "min":
|
||||
return stats.Min
|
||||
default:
|
||||
log.Errorf("Unknown stat type %s", statType)
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0
|
||||
}
|
||||
|
||||
func (r *JobRepository) JobCountGrouped(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
|
||||
groupBy *model.Aggregate,
|
||||
) ([]*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
col := groupBy2column[*groupBy]
|
||||
query := r.buildCountQuery(filter, "", col)
|
||||
@@ -315,7 +339,8 @@ func (r *JobRepository) JobCountGrouped(
|
||||
stats = append(stats,
|
||||
&model.JobsStatistics{
|
||||
ID: id.String,
|
||||
TotalJobs: int(cnt.Int64)})
|
||||
TotalJobs: int(cnt.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,8 +353,8 @@ func (r *JobRepository) AddJobCountGrouped(
|
||||
filter []*model.JobFilter,
|
||||
groupBy *model.Aggregate,
|
||||
stats []*model.JobsStatistics,
|
||||
kind string) ([]*model.JobsStatistics, error) {
|
||||
|
||||
kind string,
|
||||
) ([]*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
col := groupBy2column[*groupBy]
|
||||
query := r.buildCountQuery(filter, kind, col)
|
||||
@@ -376,8 +401,8 @@ func (r *JobRepository) AddJobCount(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
stats []*model.JobsStatistics,
|
||||
kind string) ([]*model.JobsStatistics, error) {
|
||||
|
||||
kind string,
|
||||
) ([]*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
query := r.buildCountQuery(filter, kind, "")
|
||||
query, err := SecurityCheck(ctx, query)
|
||||
@@ -420,15 +445,41 @@ func (r *JobRepository) AddJobCount(
|
||||
func (r *JobRepository) AddHistograms(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
|
||||
stat *model.JobsStatistics,
|
||||
durationBins *string,
|
||||
) (*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
|
||||
var targetBinCount int
|
||||
var targetBinSize int
|
||||
switch {
|
||||
case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes
|
||||
targetBinCount = 60
|
||||
targetBinSize = 60
|
||||
case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours
|
||||
targetBinCount = 72
|
||||
targetBinSize = 600
|
||||
case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours
|
||||
targetBinCount = 48
|
||||
targetBinSize = 3600
|
||||
case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days
|
||||
targetBinCount = 12
|
||||
targetBinSize = 21600
|
||||
case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days
|
||||
targetBinCount = 14
|
||||
targetBinSize = 43200
|
||||
default: // 24h
|
||||
targetBinCount = 24
|
||||
targetBinSize = 3600
|
||||
}
|
||||
|
||||
castType := r.getCastType()
|
||||
var err error
|
||||
value := fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
|
||||
stat.HistDuration, err = r.jobsStatisticsHistogram(ctx, value, filter)
|
||||
// Return X-Values always as seconds, will be formatted into minutes and hours in frontend
|
||||
value := fmt.Sprintf(`CAST(ROUND(((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / %d) + 1) as %s) as value`, time.Now().Unix(), targetBinSize, castType)
|
||||
stat.HistDuration, err = r.jobsDurationStatisticsHistogram(ctx, value, filter, targetBinSize, &targetBinCount)
|
||||
if err != nil {
|
||||
log.Warn("Error while loading job statistics histogram: running jobs")
|
||||
log.Warn("Error while loading job statistics histogram: job duration")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -459,14 +510,16 @@ func (r *JobRepository) AddMetricHistograms(
|
||||
ctx context.Context,
|
||||
filter []*model.JobFilter,
|
||||
metrics []string,
|
||||
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
|
||||
stat *model.JobsStatistics,
|
||||
targetBinCount *int,
|
||||
) (*model.JobsStatistics, error) {
|
||||
start := time.Now()
|
||||
|
||||
// Running Jobs Only: First query jobdata from sqlite, then query data and make bins
|
||||
for _, f := range filter {
|
||||
if f.State != nil {
|
||||
if len(f.State) == 1 && f.State[0] == "running" {
|
||||
stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter)
|
||||
stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter, targetBinCount)
|
||||
log.Debugf("Timer AddMetricHistograms %s", time.Since(start))
|
||||
return stat, nil
|
||||
}
|
||||
@@ -475,7 +528,7 @@ func (r *JobRepository) AddMetricHistograms(
|
||||
|
||||
// All other cases: Query and make bins in sqlite directly
|
||||
for _, m := range metrics {
|
||||
metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter)
|
||||
metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter, targetBinCount)
|
||||
if err != nil {
|
||||
log.Warnf("Error while loading job metric statistics histogram: %s", m)
|
||||
continue
|
||||
@@ -491,8 +544,8 @@ func (r *JobRepository) AddMetricHistograms(
|
||||
func (r *JobRepository) jobsStatisticsHistogram(
|
||||
ctx context.Context,
|
||||
value string,
|
||||
filters []*model.JobFilter) ([]*model.HistoPoint, error) {
|
||||
|
||||
filters []*model.JobFilter,
|
||||
) ([]*model.HistoPoint, error) {
|
||||
start := time.Now()
|
||||
query, qerr := SecurityCheck(ctx,
|
||||
sq.Select(value, "COUNT(job.id) AS count").From("job"))
|
||||
@@ -512,6 +565,7 @@ func (r *JobRepository) jobsStatisticsHistogram(
|
||||
}
|
||||
|
||||
points := make([]*model.HistoPoint, 0)
|
||||
// is it possible to introduce zero values here? requires info about bincount
|
||||
for rows.Next() {
|
||||
point := model.HistoPoint{}
|
||||
if err := rows.Scan(&point.Value, &point.Count); err != nil {
|
||||
@@ -525,39 +579,79 @@ func (r *JobRepository) jobsStatisticsHistogram(
|
||||
return points, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) jobsDurationStatisticsHistogram(
|
||||
ctx context.Context,
|
||||
value string,
|
||||
filters []*model.JobFilter,
|
||||
binSizeSeconds int,
|
||||
targetBinCount *int,
|
||||
) ([]*model.HistoPoint, error) {
|
||||
start := time.Now()
|
||||
query, qerr := SecurityCheck(ctx,
|
||||
sq.Select(value, "COUNT(job.id) AS count").From("job"))
|
||||
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
// Setup Array
|
||||
points := make([]*model.HistoPoint, 0)
|
||||
for i := 1; i <= *targetBinCount; i++ {
|
||||
point := model.HistoPoint{Value: i * binSizeSeconds, Count: 0}
|
||||
points = append(points, &point)
|
||||
}
|
||||
|
||||
for _, f := range filters {
|
||||
query = BuildWhereClause(f, query)
|
||||
}
|
||||
|
||||
rows, err := query.GroupBy("value").RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Fill Array at matching $Value
|
||||
for rows.Next() {
|
||||
point := model.HistoPoint{}
|
||||
if err := rows.Scan(&point.Value, &point.Count); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, e := range points {
|
||||
if e.Value == (point.Value * binSizeSeconds) {
|
||||
// Note:
|
||||
// Matching on unmodified integer value (and multiplying point.Value by binSizeSeconds after match)
|
||||
// causes frontend to loop into highest targetBinCount, due to zoom condition instantly being fullfilled (cause unknown)
|
||||
e.Count = point.Count
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
|
||||
return points, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) jobsMetricStatisticsHistogram(
|
||||
ctx context.Context,
|
||||
metric string,
|
||||
filters []*model.JobFilter) (*model.MetricHistoPoints, error) {
|
||||
|
||||
var dbMetric string
|
||||
switch metric {
|
||||
case "cpu_load":
|
||||
dbMetric = "load_avg"
|
||||
case "flops_any":
|
||||
dbMetric = "flops_any_avg"
|
||||
case "mem_bw":
|
||||
dbMetric = "mem_bw_avg"
|
||||
case "mem_used":
|
||||
dbMetric = "mem_used_max"
|
||||
case "net_bw":
|
||||
dbMetric = "net_bw_avg"
|
||||
case "file_bw":
|
||||
dbMetric = "file_bw_avg"
|
||||
default:
|
||||
return nil, fmt.Errorf("%s not implemented", metric)
|
||||
}
|
||||
|
||||
filters []*model.JobFilter,
|
||||
bins *int,
|
||||
) (*model.MetricHistoPoints, error) {
|
||||
// Get specific Peak or largest Peak
|
||||
var metricConfig *schema.MetricConfig
|
||||
var peak float64 = 0.0
|
||||
var unit string = ""
|
||||
var peak float64
|
||||
var unit string
|
||||
var footprintStat string
|
||||
|
||||
for _, f := range filters {
|
||||
if f.Cluster != nil {
|
||||
metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
|
||||
peak = metricConfig.Peak
|
||||
unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
|
||||
footprintStat = metricConfig.Footprint
|
||||
log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
|
||||
}
|
||||
}
|
||||
@@ -572,23 +666,29 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
|
||||
if unit == "" {
|
||||
unit = m.Unit.Prefix + m.Unit.Base
|
||||
}
|
||||
if footprintStat == "" {
|
||||
footprintStat = m.Footprint
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// log.Debugf("Metric %s: DB %s, Peak %f, Unit %s", metric, dbMetric, peak, unit)
|
||||
// log.Debugf("Metric %s, Peak %f, Unit %s, Aggregation %s", metric, peak, unit, aggreg)
|
||||
// Make bins, see https://jereze.com/code/sql-histogram/
|
||||
|
||||
start := time.Now()
|
||||
jm := fmt.Sprintf(`json_extract(footprint, "$.%s")`, (metric + "_" + footprintStat))
|
||||
|
||||
crossJoinQuery := sq.Select(
|
||||
fmt.Sprintf(`max(%s) as max`, dbMetric),
|
||||
fmt.Sprintf(`min(%s) as min`, dbMetric),
|
||||
fmt.Sprintf(`max(%s) as max`, jm),
|
||||
fmt.Sprintf(`min(%s) as min`, jm),
|
||||
).From("job").Where(
|
||||
fmt.Sprintf(`%s is not null`, dbMetric),
|
||||
"JSON_VALID(footprint)",
|
||||
).Where(
|
||||
fmt.Sprintf(`%s <= %f`, dbMetric, peak),
|
||||
fmt.Sprintf(`%s is not null`, jm),
|
||||
).Where(
|
||||
fmt.Sprintf(`%s <= %f`, jm, peak),
|
||||
)
|
||||
|
||||
crossJoinQuery, cjqerr := SecurityCheck(ctx, crossJoinQuery)
|
||||
@@ -606,17 +706,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
|
||||
return nil, sqlerr
|
||||
}
|
||||
|
||||
bins := 10
|
||||
binQuery := fmt.Sprintf(`CAST( (case when job.%s = value.max then value.max*0.999999999 else job.%s end - value.min) / (value.max - value.min) * %d as INTEGER )`, dbMetric, dbMetric, bins)
|
||||
binQuery := fmt.Sprintf(`CAST( (case when %s = value.max
|
||||
then value.max*0.999999999 else %s end - value.min) / (value.max -
|
||||
value.min) * %v as INTEGER )`, jm, jm, *bins)
|
||||
|
||||
mainQuery := sq.Select(
|
||||
fmt.Sprintf(`%s + 1 as bin`, binQuery),
|
||||
fmt.Sprintf(`count(job.%s) as count`, dbMetric),
|
||||
fmt.Sprintf(`CAST(((value.max / %d) * (%s )) as INTEGER ) as min`, bins, binQuery),
|
||||
fmt.Sprintf(`CAST(((value.max / %d) * (%s + 1 )) as INTEGER ) as max`, bins, binQuery),
|
||||
fmt.Sprintf(`count(%s) as count`, jm),
|
||||
fmt.Sprintf(`CAST(((value.max / %d) * (%v )) as INTEGER ) as min`, *bins, binQuery),
|
||||
fmt.Sprintf(`CAST(((value.max / %d) * (%v + 1 )) as INTEGER ) as max`, *bins, binQuery),
|
||||
).From("job").CrossJoin(
|
||||
fmt.Sprintf(`(%s) as value`, crossJoinQuerySql), crossJoinQueryArgs...,
|
||||
).Where(fmt.Sprintf(`job.%s is not null and job.%s <= %f`, dbMetric, dbMetric, peak))
|
||||
).Where(fmt.Sprintf(`%s is not null and %s <= %f`, jm, jm, peak))
|
||||
|
||||
mainQuery, qerr := SecurityCheck(ctx, mainQuery)
|
||||
|
||||
@@ -637,18 +738,39 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Setup Array
|
||||
points := make([]*model.MetricHistoPoint, 0)
|
||||
for rows.Next() {
|
||||
point := model.MetricHistoPoint{}
|
||||
if err := rows.Scan(&point.Bin, &point.Count, &point.Min, &point.Max); err != nil {
|
||||
log.Warnf("Error while scanning rows for %s", metric)
|
||||
return nil, err // Totally bricks cc-backend if returned and if all metrics requested?
|
||||
}
|
||||
|
||||
for i := 1; i <= *bins; i++ {
|
||||
binMax := ((int(peak) / *bins) * i)
|
||||
binMin := ((int(peak) / *bins) * (i - 1))
|
||||
point := model.MetricHistoPoint{Bin: &i, Count: 0, Min: &binMin, Max: &binMax}
|
||||
points = append(points, &point)
|
||||
}
|
||||
|
||||
result := model.MetricHistoPoints{Metric: metric, Unit: unit, Data: points}
|
||||
for rows.Next() {
|
||||
point := model.MetricHistoPoint{}
|
||||
if err := rows.Scan(&point.Bin, &point.Count, &point.Min, &point.Max); err != nil {
|
||||
log.Warnf("Error while scanning rows for %s", jm)
|
||||
return nil, err // Totally bricks cc-backend if returned and if all metrics requested?
|
||||
}
|
||||
|
||||
for _, e := range points {
|
||||
if e.Bin != nil && point.Bin != nil {
|
||||
if *e.Bin == *point.Bin {
|
||||
e.Count = point.Count
|
||||
if point.Min != nil {
|
||||
e.Min = point.Min
|
||||
}
|
||||
if point.Max != nil {
|
||||
e.Max = point.Max
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result := model.MetricHistoPoints{Metric: metric, Unit: unit, Stat: &footprintStat, Data: points}
|
||||
|
||||
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
|
||||
return &result, nil
|
||||
@@ -657,7 +779,9 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
|
||||
func (r *JobRepository) runningJobsMetricStatisticsHistogram(
|
||||
ctx context.Context,
|
||||
metrics []string,
|
||||
filters []*model.JobFilter) []*model.MetricHistoPoints {
|
||||
filters []*model.JobFilter,
|
||||
bins *int,
|
||||
) []*model.MetricHistoPoints {
|
||||
|
||||
// Get Jobs
|
||||
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil)
|
||||
@@ -681,7 +805,7 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
log.Errorf("Error while loading averages for histogram: %s", err)
|
||||
return nil
|
||||
}
|
||||
@@ -692,15 +816,14 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
|
||||
for idx, metric := range metrics {
|
||||
// Get specific Peak or largest Peak
|
||||
var metricConfig *schema.MetricConfig
|
||||
var peak float64 = 0.0
|
||||
var unit string = ""
|
||||
var peak float64
|
||||
var unit string
|
||||
|
||||
for _, f := range filters {
|
||||
if f.Cluster != nil {
|
||||
metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
|
||||
peak = metricConfig.Peak
|
||||
unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
|
||||
log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -720,28 +843,24 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
|
||||
}
|
||||
|
||||
// Make and fill bins
|
||||
bins := 10.0
|
||||
peakBin := peak / bins
|
||||
peakBin := int(peak) / *bins
|
||||
|
||||
points := make([]*model.MetricHistoPoint, 0)
|
||||
for b := 0; b < 10; b++ {
|
||||
for b := 0; b < *bins; b++ {
|
||||
count := 0
|
||||
bindex := b + 1
|
||||
bmin := math.Round(peakBin * float64(b))
|
||||
bmax := math.Round(peakBin * (float64(b) + 1.0))
|
||||
bmin := peakBin * b
|
||||
bmax := peakBin * (b + 1)
|
||||
|
||||
// Iterate AVG values for indexed metric and count for bins
|
||||
for _, val := range avgs[idx] {
|
||||
if float64(val) >= bmin && float64(val) < bmax {
|
||||
if int(val) >= bmin && int(val) < bmax {
|
||||
count += 1
|
||||
}
|
||||
}
|
||||
|
||||
bminint := int(bmin)
|
||||
bmaxint := int(bmax)
|
||||
|
||||
// Append Bin to Metric Result Array
|
||||
point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bminint, Max: &bmaxint}
|
||||
point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bmin, Max: &bmax}
|
||||
points = append(points, &point)
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
@@ -14,7 +15,13 @@ import (
|
||||
)
|
||||
|
||||
// Add the tag with id `tagId` to the job with the database id `jobId`.
|
||||
func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
|
||||
func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*schema.Tag, error) {
|
||||
j, err := r.FindByIdWithUser(user, job)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag)
|
||||
|
||||
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
@@ -23,49 +30,60 @@ func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
j, err := r.FindById(job)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags, err := r.GetTags(&job)
|
||||
tags, err := r.GetTags(user, &job)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting tags for job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, archive.UpdateTags(j, tags)
|
||||
archiveTags, err := r.getArchiveTags(&job)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting tags for job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, archive.UpdateTags(j, archiveTags)
|
||||
}
|
||||
|
||||
// Removes a tag from a job
|
||||
func (r *JobRepository) RemoveTag(job, tag int64) ([]*schema.Tag, error) {
|
||||
func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) {
|
||||
j, err := r.FindByIdWithUser(user, job)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := sq.Delete("jobtag").Where("jobtag.job_id = ?", job).Where("jobtag.tag_id = ?", tag)
|
||||
|
||||
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
s, _, _ := q.ToSql()
|
||||
log.Errorf("Error adding tag with %s: %v", s, err)
|
||||
log.Errorf("Error removing tag with %s: %v", s, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
j, err := r.FindById(job)
|
||||
if err != nil {
|
||||
log.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags, err := r.GetTags(&job)
|
||||
tags, err := r.GetTags(user, &job)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting tags for job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, archive.UpdateTags(j, tags)
|
||||
archiveTags, err := r.getArchiveTags(&job)
|
||||
if err != nil {
|
||||
log.Warn("Error while getting tags for job")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, archive.UpdateTags(j, archiveTags)
|
||||
}
|
||||
|
||||
// CreateTag creates a new tag with the specified type and name and returns its database id.
|
||||
func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64, err error) {
|
||||
q := sq.Insert("tag").Columns("tag_type", "tag_name").Values(tagType, tagName)
|
||||
func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) {
|
||||
// Default to "Global" scope if none defined
|
||||
if tagScope == "" {
|
||||
tagScope = "global"
|
||||
}
|
||||
|
||||
q := sq.Insert("tag").Columns("tag_type", "tag_name", "tag_scope").Values(tagType, tagName, tagScope)
|
||||
|
||||
res, err := q.RunWith(r.stmtCache).Exec()
|
||||
if err != nil {
|
||||
@@ -78,8 +96,9 @@ func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64,
|
||||
}
|
||||
|
||||
func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts map[string]int, err error) {
|
||||
// Fetch all Tags in DB for Display in Frontend Tag-View
|
||||
tags = make([]schema.Tag, 0, 100)
|
||||
xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name FROM tag")
|
||||
xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name, tag_scope FROM tag")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
@@ -89,22 +108,42 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts
|
||||
if err = xrows.StructScan(&t); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
tags = append(tags, t)
|
||||
|
||||
// Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags
|
||||
readable, err := r.checkScopeAuth(user, "read", t.Scope)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if readable {
|
||||
tags = append(tags, t)
|
||||
}
|
||||
}
|
||||
|
||||
q := sq.Select("t.tag_name, count(jt.tag_id)").
|
||||
// Query and Count Jobs with attached Tags
|
||||
q := sq.Select("t.tag_name, t.id, count(jt.tag_id)").
|
||||
From("tag t").
|
||||
LeftJoin("jobtag jt ON t.id = jt.tag_id").
|
||||
GroupBy("t.tag_name")
|
||||
|
||||
// Handle Scope Filtering
|
||||
scopeList := "\"global\""
|
||||
if user != nil {
|
||||
scopeList += ",\"" + user.Username + "\""
|
||||
}
|
||||
if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
scopeList += ",\"admin\""
|
||||
}
|
||||
q = q.Where("t.tag_scope IN (" + scopeList + ")")
|
||||
|
||||
// Handle Job Ownership
|
||||
if user != nil && user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { // ADMIN || SUPPORT: Count all jobs
|
||||
log.Debug("CountTags: User Admin or Support -> Count all Jobs for Tags")
|
||||
// log.Debug("CountTags: User Admin or Support -> Count all Jobs for Tags")
|
||||
// Unchanged: Needs to be own case still, due to UserRole/NoRole compatibility handling in else case
|
||||
} else if user != nil && user.HasRole(schema.RoleManager) { // MANAGER: Count own jobs plus project's jobs
|
||||
// Build ("project1", "project2", ...) list of variable length directly in SQL string
|
||||
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ? OR job.project IN (\""+strings.Join(user.Projects, "\",\"")+"\"))", user.Username)
|
||||
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.hpc_user = ? OR job.project IN (\""+strings.Join(user.Projects, "\",\"")+"\"))", user.Username)
|
||||
} else if user != nil { // USER OR NO ROLE (Compatibility): Only count own jobs
|
||||
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ?)", user.Username)
|
||||
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.hpc_user = ?)", user.Username)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
@@ -115,29 +154,44 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts
|
||||
counts = make(map[string]int)
|
||||
for rows.Next() {
|
||||
var tagName string
|
||||
var tagId int
|
||||
var count int
|
||||
if err = rows.Scan(&tagName, &count); err != nil {
|
||||
if err = rows.Scan(&tagName, &tagId, &count); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
counts[tagName] = count
|
||||
// Use tagId as second Map-Key component to differentiate tags with identical names
|
||||
counts[fmt.Sprint(tagName, tagId)] = count
|
||||
}
|
||||
err = rows.Err()
|
||||
|
||||
return
|
||||
return tags, counts, err
|
||||
}
|
||||
|
||||
// AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`.
|
||||
// If such a tag does not yet exist, it is created.
|
||||
func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName string) (tagId int64, err error) {
|
||||
tagId, exists := r.TagId(tagType, tagName)
|
||||
func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType string, tagName string, tagScope string) (tagId int64, err error) {
|
||||
// Default to "Global" scope if none defined
|
||||
if tagScope == "" {
|
||||
tagScope = "global"
|
||||
}
|
||||
|
||||
writable, err := r.checkScopeAuth(user, "write", tagScope)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if !writable {
|
||||
return 0, fmt.Errorf("cannot write tag scope with current authorization")
|
||||
}
|
||||
|
||||
tagId, exists := r.TagId(tagType, tagName, tagScope)
|
||||
if !exists {
|
||||
tagId, err = r.CreateTag(tagType, tagName)
|
||||
tagId, err = r.CreateTag(tagType, tagName, tagScope)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := r.AddTag(jobId, tagId); err != nil {
|
||||
if _, err := r.AddTag(user, jobId, tagId); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
@@ -145,19 +199,19 @@ func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName stri
|
||||
}
|
||||
|
||||
// TagId returns the database id of the tag with the specified type, name and scope.
|
||||
func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) {
|
||||
func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) {
|
||||
exists = true
|
||||
if err := sq.Select("id").From("tag").
|
||||
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).
|
||||
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).Where("tag.tag_scope = ?", tagScope).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil {
|
||||
exists = false
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// GetTags returns a list of all tags if job is nil or of the tags that the job with that database ID has.
|
||||
func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
|
||||
q := sq.Select("id", "tag_type", "tag_name").From("tag")
|
||||
// GetTags returns a list of all scoped tags if job is nil or of the tags that the job with that database ID has.
|
||||
func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, error) {
|
||||
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")
|
||||
if job != nil {
|
||||
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
|
||||
}
|
||||
@@ -172,7 +226,41 @@ func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
|
||||
tags := make([]*schema.Tag, 0)
|
||||
for rows.Next() {
|
||||
tag := &schema.Tag{}
|
||||
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name); err != nil {
|
||||
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
// Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags
|
||||
readable, err := r.checkScopeAuth(user, "read", tag.Scope)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if readable {
|
||||
tags = append(tags, tag)
|
||||
}
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
// getArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has.
|
||||
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) {
|
||||
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")
|
||||
if job != nil {
|
||||
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
s, _, _ := q.ToSql()
|
||||
log.Errorf("Error get tags with %s: %v", s, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := make([]*schema.Tag, 0)
|
||||
for rows.Next() {
|
||||
tag := &schema.Tag{}
|
||||
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil {
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
@@ -181,3 +269,59 @@ func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, tagScope string) (err error) {
|
||||
// Import has no scope ctx, only import from metafile to DB (No recursive archive update required), only returns err
|
||||
|
||||
tagId, exists := r.TagId(tagType, tagName, tagScope)
|
||||
if !exists {
|
||||
tagId, err = r.CreateTag(tagType, tagName, tagScope)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId)
|
||||
|
||||
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
s, _, _ := q.ToSql()
|
||||
log.Errorf("Error adding tag on import with %s: %v", s, err)
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scope string) (pass bool, err error) {
|
||||
if user != nil {
|
||||
switch {
|
||||
case operation == "write" && scope == "admin":
|
||||
if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
case operation == "write" && scope == "global":
|
||||
if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
case operation == "write" && scope == user.Username:
|
||||
return true, nil
|
||||
case operation == "read" && scope == "admin":
|
||||
return user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}), nil
|
||||
case operation == "read" && scope == "global":
|
||||
return true, nil
|
||||
case operation == "read" && scope == user.Username:
|
||||
return true, nil
|
||||
default:
|
||||
if operation == "read" || operation == "write" {
|
||||
// No acceptable scope: deny tag
|
||||
return false, nil
|
||||
} else {
|
||||
return false, fmt.Errorf("error while checking tag operation auth: unknown operation (%s)", operation)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return false, fmt.Errorf("error while checking tag operation auth: no user in context")
|
||||
}
|
||||
}
|
||||
|
||||
BIN
internal/repository/testdata/job.db
vendored
BIN
internal/repository/testdata/job.db
vendored
Binary file not shown.
BIN
internal/repository/testdata/job.db-shm
vendored
BIN
internal/repository/testdata/job.db-shm
vendored
Binary file not shown.
0
internal/repository/testdata/job.db-wal
vendored
0
internal/repository/testdata/job.db-wal
vendored
@@ -6,7 +6,6 @@ package repository
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
@@ -18,20 +17,12 @@ type Transaction struct {
|
||||
func (r *JobRepository) TransactionInit() (*Transaction, error) {
|
||||
var err error
|
||||
t := new(Transaction)
|
||||
// Inserts are bundled into transactions because in sqlite,
|
||||
// that speeds up inserts A LOT.
|
||||
|
||||
t.tx, err = r.DB.Beginx()
|
||||
if err != nil {
|
||||
log.Warn("Error while bundling transactions")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
t.stmt, err = t.tx.PrepareNamed(NamedJobInsert)
|
||||
if err != nil {
|
||||
log.Warn("Error while preparing namedJobInsert")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
|
||||
|
||||
@@ -50,7 +41,6 @@ func (r *JobRepository) TransactionCommit(t *Transaction) error {
|
||||
return err
|
||||
}
|
||||
|
||||
t.stmt = t.tx.NamedStmt(t.stmt)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -59,14 +49,17 @@ func (r *JobRepository) TransactionEnd(t *Transaction) error {
|
||||
log.Warn("Error while committing SQL transactions")
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, error) {
|
||||
res, err := t.stmt.Exec(job)
|
||||
func (r *JobRepository) TransactionAddNamed(
|
||||
t *Transaction,
|
||||
query string,
|
||||
args ...interface{},
|
||||
) (int64, error) {
|
||||
res, err := t.tx.NamedExec(query, args)
|
||||
if err != nil {
|
||||
log.Errorf("repository initDB(): %v", err)
|
||||
log.Errorf("Named Exec failed: %v", err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
@@ -79,26 +72,19 @@ func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, e
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) TransactionAddTag(t *Transaction, tag *schema.Tag) (int64, error) {
|
||||
res, err := t.tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type)
|
||||
func (r *JobRepository) TransactionAdd(t *Transaction, query string, args ...interface{}) (int64, error) {
|
||||
|
||||
res, err := t.tx.Exec(query, args...)
|
||||
if err != nil {
|
||||
log.Errorf("Error while inserting tag into tag table: %v (Type %v)", tag.Name, tag.Type)
|
||||
return 0, err
|
||||
}
|
||||
tagId, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
log.Warn("Error while getting last insert ID")
|
||||
log.Errorf("TransactionAdd(), Exec() Error: %v", err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return tagId, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) TransactionSetTag(t *Transaction, jobId int64, tagId int64) error {
|
||||
if _, err := t.tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, jobId, tagId); err != nil {
|
||||
log.Errorf("Error while inserting jobtag into jobtag table: %v (TagID %v)", jobId, tagId)
|
||||
return err
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
log.Errorf("TransactionAdd(), LastInsertId() Error: %v", err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return nil
|
||||
return id, nil
|
||||
}
|
||||
|
||||
@@ -46,8 +46,8 @@ func GetUserRepository() *UserRepository {
|
||||
func (r *UserRepository) GetUser(username string) (*schema.User, error) {
|
||||
user := &schema.User{Username: username}
|
||||
var hashedPassword, name, rawRoles, email, rawProjects sql.NullString
|
||||
if err := sq.Select("password", "ldap", "name", "roles", "email", "projects").From("user").
|
||||
Where("user.username = ?", username).RunWith(r.DB).
|
||||
if err := sq.Select("password", "ldap", "name", "roles", "email", "projects").From("hpc_user").
|
||||
Where("hpc_user.username = ?", username).RunWith(r.DB).
|
||||
QueryRow().Scan(&hashedPassword, &user.AuthSource, &name, &rawRoles, &email, &rawProjects); err != nil {
|
||||
log.Warnf("Error while querying user '%v' from database", username)
|
||||
return nil, err
|
||||
@@ -72,9 +72,8 @@ func (r *UserRepository) GetUser(username string) (*schema.User, error) {
|
||||
}
|
||||
|
||||
func (r *UserRepository) GetLdapUsernames() ([]string, error) {
|
||||
|
||||
var users []string
|
||||
rows, err := r.DB.Query(`SELECT username FROM user WHERE user.ldap = 1`)
|
||||
rows, err := r.DB.Query(`SELECT username FROM hpc_user WHERE hpc_user.ldap = 1`)
|
||||
if err != nil {
|
||||
log.Warn("Error while querying usernames")
|
||||
return nil, err
|
||||
@@ -122,7 +121,7 @@ func (r *UserRepository) AddUser(user *schema.User) error {
|
||||
vals = append(vals, int(user.AuthSource))
|
||||
}
|
||||
|
||||
if _, err := sq.Insert("user").Columns(cols...).Values(vals...).RunWith(r.DB).Exec(); err != nil {
|
||||
if _, err := sq.Insert("hpc_user").Columns(cols...).Values(vals...).RunWith(r.DB).Exec(); err != nil {
|
||||
log.Errorf("Error while inserting new user '%v' into DB", user.Username)
|
||||
return err
|
||||
}
|
||||
@@ -131,9 +130,29 @@ func (r *UserRepository) AddUser(user *schema.User) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *UserRepository) DelUser(username string) error {
|
||||
func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) error {
|
||||
// user contains updated info, apply to dbuser
|
||||
// TODO: Discuss updatable fields
|
||||
if dbUser.Name != user.Name {
|
||||
if _, err := sq.Update("hpc_user").Set("name", user.Name).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
|
||||
log.Errorf("error while updating name of user '%s'", user.Username)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
_, err := r.DB.Exec(`DELETE FROM user WHERE user.username = ?`, username)
|
||||
// Toggled until greenlit
|
||||
// if dbUser.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) {
|
||||
// projects, _ := json.Marshal(user.Projects)
|
||||
// if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *UserRepository) DelUser(username string) error {
|
||||
_, err := r.DB.Exec(`DELETE FROM hpc_user WHERE hpc_user.username = ?`, username)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting user '%s' from DB", username)
|
||||
return err
|
||||
@@ -143,8 +162,7 @@ func (r *UserRepository) DelUser(username string) error {
|
||||
}
|
||||
|
||||
func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) {
|
||||
|
||||
q := sq.Select("username", "name", "email", "roles", "projects").From("user")
|
||||
q := sq.Select("username", "name", "email", "roles", "projects").From("hpc_user")
|
||||
if specialsOnly {
|
||||
q = q.Where("(roles != '[\"user\"]' AND roles != '[]')")
|
||||
}
|
||||
@@ -186,8 +204,8 @@ func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) {
|
||||
func (r *UserRepository) AddRole(
|
||||
ctx context.Context,
|
||||
username string,
|
||||
queryrole string) error {
|
||||
|
||||
queryrole string,
|
||||
) error {
|
||||
newRole := strings.ToLower(queryrole)
|
||||
user, err := r.GetUser(username)
|
||||
if err != nil {
|
||||
@@ -198,15 +216,15 @@ func (r *UserRepository) AddRole(
|
||||
exists, valid := user.HasValidRole(newRole)
|
||||
|
||||
if !valid {
|
||||
return fmt.Errorf("Supplied role is no valid option : %v", newRole)
|
||||
return fmt.Errorf("supplied role is no valid option : %v", newRole)
|
||||
}
|
||||
if exists {
|
||||
return fmt.Errorf("User %v already has role %v", username, newRole)
|
||||
return fmt.Errorf("user %v already has role %v", username, newRole)
|
||||
}
|
||||
|
||||
roles, _ := json.Marshal(append(user.Roles, newRole))
|
||||
if _, err := sq.Update("user").Set("roles", roles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
log.Errorf("Error while adding new role for user '%s'", user.Username)
|
||||
if _, err := sq.Update("hpc_user").Set("roles", roles).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
log.Errorf("error while adding new role for user '%s'", user.Username)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
@@ -223,14 +241,14 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
|
||||
exists, valid := user.HasValidRole(oldRole)
|
||||
|
||||
if !valid {
|
||||
return fmt.Errorf("Supplied role is no valid option : %v", oldRole)
|
||||
return fmt.Errorf("supplied role is no valid option : %v", oldRole)
|
||||
}
|
||||
if !exists {
|
||||
return fmt.Errorf("Role already deleted for user '%v': %v", username, oldRole)
|
||||
return fmt.Errorf("role already deleted for user '%v': %v", username, oldRole)
|
||||
}
|
||||
|
||||
if oldRole == schema.GetRoleString(schema.RoleManager) && len(user.Projects) != 0 {
|
||||
return fmt.Errorf("Cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects)
|
||||
return fmt.Errorf("cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects)
|
||||
}
|
||||
|
||||
var newroles []string
|
||||
@@ -240,8 +258,8 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
|
||||
}
|
||||
}
|
||||
|
||||
var mroles, _ = json.Marshal(newroles)
|
||||
if _, err := sq.Update("user").Set("roles", mroles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
mroles, _ := json.Marshal(newroles)
|
||||
if _, err := sq.Update("hpc_user").Set("roles", mroles).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
log.Errorf("Error while removing role for user '%s'", user.Username)
|
||||
return err
|
||||
}
|
||||
@@ -251,15 +269,15 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
|
||||
func (r *UserRepository) AddProject(
|
||||
ctx context.Context,
|
||||
username string,
|
||||
project string) error {
|
||||
|
||||
project string,
|
||||
) error {
|
||||
user, err := r.GetUser(username)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !user.HasRole(schema.RoleManager) {
|
||||
return fmt.Errorf("user '%s' is not a manager!", username)
|
||||
return fmt.Errorf("user '%s' is not a manager", username)
|
||||
}
|
||||
|
||||
if user.HasProject(project) {
|
||||
@@ -267,7 +285,7 @@ func (r *UserRepository) AddProject(
|
||||
}
|
||||
|
||||
projects, _ := json.Marshal(append(user.Projects, project))
|
||||
if _, err := sq.Update("user").Set("projects", projects).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -281,11 +299,11 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro
|
||||
}
|
||||
|
||||
if !user.HasRole(schema.RoleManager) {
|
||||
return fmt.Errorf("user '%#v' is not a manager!", username)
|
||||
return fmt.Errorf("user '%#v' is not a manager", username)
|
||||
}
|
||||
|
||||
if !user.HasProject(project) {
|
||||
return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match!", username, project)
|
||||
return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match", username, project)
|
||||
}
|
||||
|
||||
var exists bool
|
||||
@@ -298,14 +316,14 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro
|
||||
}
|
||||
}
|
||||
|
||||
if exists == true {
|
||||
if exists {
|
||||
var result interface{}
|
||||
if len(newprojects) == 0 {
|
||||
result = "[]"
|
||||
} else {
|
||||
result, _ = json.Marshal(newprojects)
|
||||
}
|
||||
if _, err := sq.Update("user").Set("projects", result).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
if _, err := sq.Update("hpc_user").Set("projects", result).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
@@ -321,9 +339,10 @@ const ContextUserKey ContextKey = "user"
|
||||
func GetUserFromContext(ctx context.Context) *schema.User {
|
||||
x := ctx.Value(ContextUserKey)
|
||||
if x == nil {
|
||||
log.Warnf("no user retrieved from context")
|
||||
return nil
|
||||
}
|
||||
|
||||
// log.Infof("user retrieved from context: %v", x.(*schema.User))
|
||||
return x.(*schema.User)
|
||||
}
|
||||
|
||||
@@ -336,7 +355,7 @@ func (r *UserRepository) FetchUserInCtx(ctx context.Context, username string) (*
|
||||
|
||||
user := &model.User{Username: username}
|
||||
var name, email sql.NullString
|
||||
if err := sq.Select("name", "email").From("user").Where("user.username = ?", username).
|
||||
if err := sq.Select("name", "email").From("hpc_user").Where("hpc_user.username = ?", username).
|
||||
RunWith(r.DB).QueryRow().Scan(&name, &email); err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
/* This warning will be logged *often* for non-local users, i.e. users mentioned only in job-table or archive, */
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
@@ -34,32 +35,38 @@ type Route struct {
|
||||
|
||||
var routes []Route = []Route{
|
||||
{"/", "home.tmpl", "ClusterCockpit", false, setupHomeRoute},
|
||||
{"/config", "config.tmpl", "Settings", false, func(i InfoType, r *http.Request) InfoType { return i }},
|
||||
{"/config", "config.tmpl", "Settings", false, setupConfigRoute},
|
||||
{"/monitoring/jobs/", "monitoring/jobs.tmpl", "Jobs - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { return i }},
|
||||
{"/monitoring/job/{id:[0-9]+}", "monitoring/job.tmpl", "Job <ID> - ClusterCockpit", false, setupJobRoute},
|
||||
{"/monitoring/users/", "monitoring/list.tmpl", "Users - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "USER"; return i }},
|
||||
{"/monitoring/projects/", "monitoring/list.tmpl", "Projects - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "PROJECT"; return i }},
|
||||
{"/monitoring/tags/", "monitoring/taglist.tmpl", "Tags - ClusterCockpit", false, setupTaglistRoute},
|
||||
{"/monitoring/user/{id}", "monitoring/user.tmpl", "User <ID> - ClusterCockpit", true, setupUserRoute},
|
||||
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> - ClusterCockpit", false, setupClusterRoute},
|
||||
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> Node Overview - ClusterCockpit", false, setupClusterOverviewRoute},
|
||||
{"/monitoring/systems/list/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> Node List - ClusterCockpit", false, setupClusterListRoute},
|
||||
{"/monitoring/systems/list/{cluster}/{subcluster}", "monitoring/systems.tmpl", "Cluster <ID> <SID> Node List - ClusterCockpit", false, setupClusterListRoute},
|
||||
{"/monitoring/node/{cluster}/{hostname}", "monitoring/node.tmpl", "Node <ID> - ClusterCockpit", false, setupNodeRoute},
|
||||
{"/monitoring/analysis/{cluster}", "monitoring/analysis.tmpl", "Analysis - ClusterCockpit", true, setupAnalysisRoute},
|
||||
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterRoute},
|
||||
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterStatusRoute},
|
||||
}
|
||||
|
||||
func setupHomeRoute(i InfoType, r *http.Request) InfoType {
|
||||
jobRepo := repository.GetJobRepository()
|
||||
groupBy := model.AggregateCluster
|
||||
|
||||
// startJobCount := time.Now()
|
||||
stats, err := jobRepo.JobCountGrouped(r.Context(), nil, &groupBy)
|
||||
if err != nil {
|
||||
log.Warnf("failed to count jobs: %s", err.Error())
|
||||
}
|
||||
// log.Infof("Timer HOME ROUTE startJobCount: %s", time.Since(startJobCount))
|
||||
|
||||
// startRunningJobCount := time.Now()
|
||||
stats, err = jobRepo.AddJobCountGrouped(r.Context(), nil, &groupBy, stats, "running")
|
||||
if err != nil {
|
||||
log.Warnf("failed to count running jobs: %s", err.Error())
|
||||
}
|
||||
// log.Infof("Timer HOME ROUTE startRunningJobCount: %s", time.Since(startRunningJobCount))
|
||||
|
||||
i["clusters"] = stats
|
||||
|
||||
@@ -75,8 +82,22 @@ func setupHomeRoute(i InfoType, r *http.Request) InfoType {
|
||||
return i
|
||||
}
|
||||
|
||||
func setupConfigRoute(i InfoType, r *http.Request) InfoType {
|
||||
if util.CheckFileExists("./var/notice.txt") {
|
||||
msg, err := os.ReadFile("./var/notice.txt")
|
||||
if err == nil {
|
||||
i["ncontent"] = string(msg)
|
||||
}
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
func setupJobRoute(i InfoType, r *http.Request) InfoType {
|
||||
i["id"] = mux.Vars(r)["id"]
|
||||
if config.Keys.EmissionConstant != 0 {
|
||||
i["emission"] = config.Keys.EmissionConstant
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
@@ -92,7 +113,7 @@ func setupUserRoute(i InfoType, r *http.Request) InfoType {
|
||||
return i
|
||||
}
|
||||
|
||||
func setupClusterRoute(i InfoType, r *http.Request) InfoType {
|
||||
func setupClusterStatusRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["id"] = vars["cluster"]
|
||||
i["cluster"] = vars["cluster"]
|
||||
@@ -104,6 +125,36 @@ func setupClusterRoute(i InfoType, r *http.Request) InfoType {
|
||||
return i
|
||||
}
|
||||
|
||||
func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["id"] = vars["cluster"]
|
||||
i["cluster"] = vars["cluster"]
|
||||
i["displayType"] = "OVERVIEW"
|
||||
|
||||
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
|
||||
if from != "" || to != "" {
|
||||
i["from"] = from
|
||||
i["to"] = to
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func setupClusterListRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["id"] = vars["cluster"]
|
||||
i["cluster"] = vars["cluster"]
|
||||
i["sid"] = vars["subcluster"]
|
||||
i["subCluster"] = vars["subcluster"]
|
||||
i["displayType"] = "LIST"
|
||||
|
||||
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
|
||||
if from != "" || to != "" {
|
||||
i["from"] = from
|
||||
i["to"] = to
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func setupNodeRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["cluster"] = vars["cluster"]
|
||||
@@ -124,28 +175,46 @@ func setupAnalysisRoute(i InfoType, r *http.Request) InfoType {
|
||||
|
||||
func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
|
||||
jobRepo := repository.GetJobRepository()
|
||||
user := repository.GetUserFromContext(r.Context())
|
||||
|
||||
tags, counts, err := jobRepo.CountTags(user)
|
||||
tags, counts, err := jobRepo.CountTags(repository.GetUserFromContext(r.Context()))
|
||||
tagMap := make(map[string][]map[string]interface{})
|
||||
if err != nil {
|
||||
log.Warnf("GetTags failed: %s", err.Error())
|
||||
i["tagmap"] = tagMap
|
||||
return i
|
||||
}
|
||||
|
||||
for _, tag := range tags {
|
||||
tagItem := map[string]interface{}{
|
||||
"id": tag.ID,
|
||||
"name": tag.Name,
|
||||
"count": counts[tag.Name],
|
||||
// Reduces displayed tags for unauth'd users
|
||||
userAuthlevel := repository.GetUserFromContext(r.Context()).GetAuthLevel()
|
||||
// Uses tag.ID as second Map-Key component to differentiate tags with identical names
|
||||
if userAuthlevel >= 4 { // Support+ : Show tags for all scopes, regardless of count
|
||||
for _, tag := range tags {
|
||||
tagItem := map[string]interface{}{
|
||||
"id": tag.ID,
|
||||
"name": tag.Name,
|
||||
"scope": tag.Scope,
|
||||
"count": counts[fmt.Sprint(tag.Name, tag.ID)],
|
||||
}
|
||||
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
|
||||
}
|
||||
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
|
||||
}
|
||||
} else if userAuthlevel < 4 && userAuthlevel >= 2 { // User+ : Show global and admin scope only if at least 1 tag used, private scope regardless of count
|
||||
for _, tag := range tags {
|
||||
tagCount := counts[fmt.Sprint(tag.Name, tag.ID)]
|
||||
if ((tag.Scope == "global" || tag.Scope == "admin") && tagCount >= 1) || (tag.Scope != "global" && tag.Scope != "admin") {
|
||||
tagItem := map[string]interface{}{
|
||||
"id": tag.ID,
|
||||
"name": tag.Name,
|
||||
"scope": tag.Scope,
|
||||
"count": tagCount,
|
||||
}
|
||||
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
|
||||
}
|
||||
}
|
||||
} // auth < 2 return nothing for this route
|
||||
|
||||
i["tagmap"] = tagMap
|
||||
return i
|
||||
}
|
||||
|
||||
// FIXME: Lots of redundant code. Needs refactoring
|
||||
func buildFilterPresets(query url.Values) map[string]interface{} {
|
||||
filterPresets := map[string]interface{}{}
|
||||
|
||||
@@ -208,6 +277,16 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
|
||||
}
|
||||
}
|
||||
}
|
||||
if query.Get("numHWThreads") != "" {
|
||||
parts := strings.Split(query.Get("numHWThreads"), "-")
|
||||
if len(parts) == 2 {
|
||||
a, e1 := strconv.Atoi(parts[0])
|
||||
b, e2 := strconv.Atoi(parts[1])
|
||||
if e1 == nil && e2 == nil {
|
||||
filterPresets["numHWThreads"] = map[string]int{"from": a, "to": b}
|
||||
}
|
||||
}
|
||||
}
|
||||
if query.Get("numAccelerators") != "" {
|
||||
parts := strings.Split(query.Get("numAccelerators"), "-")
|
||||
if len(parts) == 2 {
|
||||
@@ -234,7 +313,7 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
|
||||
}
|
||||
if query.Get("startTime") != "" {
|
||||
parts := strings.Split(query.Get("startTime"), "-")
|
||||
if len(parts) == 2 {
|
||||
if len(parts) == 2 { // Time in seconds, from - to
|
||||
a, e1 := strconv.ParseInt(parts[0], 10, 64)
|
||||
b, e2 := strconv.ParseInt(parts[1], 10, 64)
|
||||
if e1 == nil && e2 == nil {
|
||||
@@ -243,9 +322,41 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
|
||||
"to": time.Unix(b, 0).Format(time.RFC3339),
|
||||
}
|
||||
}
|
||||
} else { // named range
|
||||
filterPresets["startTime"] = map[string]string{
|
||||
"range": query.Get("startTime"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if query.Get("energy") != "" {
|
||||
parts := strings.Split(query.Get("energy"), "-")
|
||||
if len(parts) == 2 {
|
||||
a, e1 := strconv.Atoi(parts[0])
|
||||
b, e2 := strconv.Atoi(parts[1])
|
||||
if e1 == nil && e2 == nil {
|
||||
filterPresets["energy"] = map[string]int{"from": a, "to": b}
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(query["stat"]) != 0 {
|
||||
statList := make([]map[string]interface{}, 0)
|
||||
for _, statEntry := range query["stat"] {
|
||||
parts := strings.Split(statEntry, "-")
|
||||
if len(parts) == 3 { // Metric Footprint Stat Field, from - to
|
||||
a, e1 := strconv.ParseInt(parts[1], 10, 64)
|
||||
b, e2 := strconv.ParseInt(parts[2], 10, 64)
|
||||
if e1 == nil && e2 == nil {
|
||||
statEntry := map[string]interface{}{
|
||||
"field": parts[0],
|
||||
"from": a,
|
||||
"to": b,
|
||||
}
|
||||
statList = append(statList, statEntry)
|
||||
}
|
||||
}
|
||||
}
|
||||
filterPresets["stats"] = statList
|
||||
}
|
||||
return filterPresets
|
||||
}
|
||||
|
||||
@@ -264,20 +375,25 @@ func SetupRoutes(router *mux.Router, buildInfo web.Build) {
|
||||
infos := route.Setup(map[string]interface{}{}, r)
|
||||
if id, ok := infos["id"]; ok {
|
||||
title = strings.Replace(route.Title, "<ID>", id.(string), 1)
|
||||
if sid, ok := infos["sid"]; ok { // 2nd ID element
|
||||
title = strings.Replace(title, "<SID>", sid.(string), 1)
|
||||
}
|
||||
}
|
||||
|
||||
// Get User -> What if NIL?
|
||||
user := repository.GetUserFromContext(r.Context())
|
||||
|
||||
// Get Roles
|
||||
availableRoles, _ := schema.GetValidRolesMap(user)
|
||||
|
||||
page := web.Page{
|
||||
Title: title,
|
||||
User: *user,
|
||||
Roles: availableRoles,
|
||||
Build: buildInfo,
|
||||
Config: conf,
|
||||
Infos: infos,
|
||||
Title: title,
|
||||
User: *user,
|
||||
Roles: availableRoles,
|
||||
Build: buildInfo,
|
||||
Config: conf,
|
||||
Resampling: config.Keys.EnableResampling,
|
||||
Infos: infos,
|
||||
}
|
||||
|
||||
if route.Filter {
|
||||
|
||||
41
internal/taskManager/compressionService.go
Normal file
41
internal/taskManager/compressionService.go
Normal file
@@ -0,0 +1,41 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterCompressionService(compressOlderThan int) {
|
||||
log.Info("Register compression service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(05, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
var jobs []*schema.Job
|
||||
var err error
|
||||
|
||||
ar := archive.GetHandle()
|
||||
startTime := time.Now().Unix() - int64(compressOlderThan*24*3600)
|
||||
lastTime := ar.CompressLast(startTime)
|
||||
if startTime == lastTime {
|
||||
log.Info("Compression Service - Complete archive run")
|
||||
jobs, err = jobRepo.FindJobsBetween(0, startTime)
|
||||
|
||||
} else {
|
||||
jobs, err = jobRepo.FindJobsBetween(lastTime, startTime)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for compression jobs: %v", err)
|
||||
}
|
||||
ar.Compress(jobs)
|
||||
}))
|
||||
}
|
||||
36
internal/taskManager/ldapSyncService.go
Normal file
36
internal/taskManager/ldapSyncService.go
Normal file
@@ -0,0 +1,36 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterLdapSyncService(ds string) {
|
||||
interval, err := parseDuration(ds)
|
||||
if err != nil {
|
||||
log.Warnf("Could not parse duration for sync interval: %v",
|
||||
ds)
|
||||
return
|
||||
}
|
||||
|
||||
auth := auth.GetAuthInstance()
|
||||
|
||||
log.Info("Register LDAP sync service")
|
||||
s.NewJob(gocron.DurationJob(interval),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
t := time.Now()
|
||||
log.Printf("ldap sync started at %s", t.Format(time.RFC3339))
|
||||
if err := auth.LdapAuth.Sync(); err != nil {
|
||||
log.Errorf("ldap sync failed: %s", err.Error())
|
||||
}
|
||||
log.Print("ldap sync done")
|
||||
}))
|
||||
}
|
||||
67
internal/taskManager/retentionService.go
Normal file
67
internal/taskManager/retentionService.go
Normal file
@@ -0,0 +1,67 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterRetentionDeleteService(age int, includeDB bool) {
|
||||
log.Info("Register retention delete service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
startTime := time.Now().Unix() - int64(age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().CleanUp(jobs)
|
||||
|
||||
if includeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %s", err.Error())
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
func RegisterRetentionMoveService(age int, includeDB bool, location string) {
|
||||
log.Info("Register retention move service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
startTime := time.Now().Unix() - int64(age*24*3600)
|
||||
jobs, err := jobRepo.FindJobsBetween(0, startTime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for retention jobs: %s", err.Error())
|
||||
}
|
||||
archive.GetHandle().Move(jobs, location)
|
||||
|
||||
if includeDB {
|
||||
cnt, err := jobRepo.DeleteJobsBefore(startTime)
|
||||
if err != nil {
|
||||
log.Errorf("Error while deleting retention jobs from db: %v", err)
|
||||
} else {
|
||||
log.Infof("Retention: Removed %d jobs from db", cnt)
|
||||
}
|
||||
if err = jobRepo.Optimize(); err != nil {
|
||||
log.Errorf("Error occured in db optimization: %v", err)
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
27
internal/taskManager/stopJobsExceedTime.go
Normal file
27
internal/taskManager/stopJobsExceedTime.go
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterStopJobsExceedTime() {
|
||||
log.Info("Register undead jobs service")
|
||||
|
||||
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(03, 0, 0))),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
err := jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
|
||||
if err != nil {
|
||||
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
|
||||
}
|
||||
runtime.GC()
|
||||
}))
|
||||
}
|
||||
90
internal/taskManager/taskManager.go
Normal file
90
internal/taskManager/taskManager.go
Normal file
@@ -0,0 +1,90 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
var (
|
||||
s gocron.Scheduler
|
||||
jobRepo *repository.JobRepository
|
||||
)
|
||||
|
||||
func parseDuration(s string) (time.Duration, error) {
|
||||
interval, err := time.ParseDuration(s)
|
||||
if err != nil {
|
||||
log.Warnf("Could not parse duration for sync interval: %v",
|
||||
s)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if interval == 0 {
|
||||
log.Info("TaskManager: Sync interval is zero")
|
||||
}
|
||||
|
||||
return interval, nil
|
||||
}
|
||||
|
||||
func Start() {
|
||||
var err error
|
||||
jobRepo = repository.GetJobRepository()
|
||||
s, err = gocron.NewScheduler()
|
||||
if err != nil {
|
||||
log.Fatalf("Error while creating gocron scheduler: %s", err.Error())
|
||||
}
|
||||
|
||||
if config.Keys.StopJobsExceedingWalltime > 0 {
|
||||
RegisterStopJobsExceedTime()
|
||||
}
|
||||
|
||||
var cfg struct {
|
||||
Retention schema.Retention `json:"retention"`
|
||||
Compression int `json:"compression"`
|
||||
}
|
||||
cfg.Retention.IncludeDB = true
|
||||
|
||||
if err := json.Unmarshal(config.Keys.Archive, &cfg); err != nil {
|
||||
log.Warn("Error while unmarshaling raw config json")
|
||||
}
|
||||
|
||||
switch cfg.Retention.Policy {
|
||||
case "delete":
|
||||
RegisterRetentionDeleteService(
|
||||
cfg.Retention.Age,
|
||||
cfg.Retention.IncludeDB)
|
||||
case "move":
|
||||
RegisterRetentionMoveService(
|
||||
cfg.Retention.Age,
|
||||
cfg.Retention.IncludeDB,
|
||||
cfg.Retention.Location)
|
||||
}
|
||||
|
||||
if cfg.Compression > 0 {
|
||||
RegisterCompressionService(cfg.Compression)
|
||||
}
|
||||
|
||||
lc := config.Keys.LdapConfig
|
||||
|
||||
if lc != nil && lc.SyncInterval != "" {
|
||||
RegisterLdapSyncService(lc.SyncInterval)
|
||||
}
|
||||
|
||||
RegisterFootprintWorker()
|
||||
RegisterUpdateDurationWorker()
|
||||
|
||||
s.Start()
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
s.Shutdown()
|
||||
}
|
||||
33
internal/taskManager/updateDurationService.go
Normal file
33
internal/taskManager/updateDurationService.go
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterUpdateDurationWorker() {
|
||||
var frequency string
|
||||
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.DurationWorker != "" {
|
||||
frequency = config.Keys.CronFrequency.DurationWorker
|
||||
} else {
|
||||
frequency = "5m"
|
||||
}
|
||||
d, _ := time.ParseDuration(frequency)
|
||||
log.Infof("Register Duration Update service with %s interval", frequency)
|
||||
|
||||
s.NewJob(gocron.DurationJob(d),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
start := time.Now()
|
||||
log.Printf("Update duration started at %s", start.Format(time.RFC3339))
|
||||
jobRepo.UpdateDuration()
|
||||
log.Printf("Update duration is done and took %s", time.Since(start))
|
||||
}))
|
||||
}
|
||||
146
internal/taskManager/updateFootprintService.go
Normal file
146
internal/taskManager/updateFootprintService.go
Normal file
@@ -0,0 +1,146 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
func RegisterFootprintWorker() {
|
||||
var frequency string
|
||||
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.FootprintWorker != "" {
|
||||
frequency = config.Keys.CronFrequency.FootprintWorker
|
||||
} else {
|
||||
frequency = "10m"
|
||||
}
|
||||
d, _ := time.ParseDuration(frequency)
|
||||
log.Infof("Register Footprint Update service with %s interval", frequency)
|
||||
|
||||
s.NewJob(gocron.DurationJob(d),
|
||||
gocron.NewTask(
|
||||
func() {
|
||||
s := time.Now()
|
||||
c := 0
|
||||
ce := 0
|
||||
cl := 0
|
||||
log.Printf("Update Footprints started at %s", s.Format(time.RFC3339))
|
||||
|
||||
for _, cluster := range archive.Clusters {
|
||||
s_cluster := time.Now()
|
||||
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// NOTE: Additional Subcluster Loop Could Allow For Limited List Of Footprint-Metrics Only.
|
||||
// - Chunk-Size Would Then Be 'SubCluster' (Running Jobs, Transactions) as Lists Can Change Within SCs
|
||||
// - Would Require Review of 'updateFootprint' Usage (Logic Could Possibly Be Included Here Completely)
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster.Name)
|
||||
if err != nil {
|
||||
log.Errorf("no metric data repository configured for '%s'", cluster.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
pendingStatements := []sq.UpdateBuilder{}
|
||||
|
||||
for _, job := range jobs {
|
||||
log.Debugf("Prepare job %d", job.JobID)
|
||||
cl++
|
||||
|
||||
s_job := time.Now()
|
||||
|
||||
jobStats, err := repo.LoadStats(job, allMetrics, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("error wile loading job data stats for footprint update: %v", err)
|
||||
ce++
|
||||
continue
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for _, metric := range allMetrics {
|
||||
avg, min, max := 0.0, 0.0, 0.0
|
||||
data, ok := jobStats[metric] // JobStats[Metric1:[Hostname1:[Stats], Hostname2:[Stats], ...], Metric2[...] ...]
|
||||
if ok {
|
||||
for _, res := range job.Resources {
|
||||
hostStats, ok := data[res.Hostname]
|
||||
if ok {
|
||||
avg += hostStats.Avg
|
||||
min = math.Min(min, hostStats.Min)
|
||||
max = math.Max(max, hostStats.Max)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Add values rounded to 2 digits: repo.LoadStats may return unrounded
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
// Build Statement per Job, Add to Pending Array
|
||||
stmt := sq.Update("job")
|
||||
stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta)
|
||||
if err != nil {
|
||||
log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error())
|
||||
ce++
|
||||
continue
|
||||
}
|
||||
stmt = stmt.Where("job.id = ?", job.ID)
|
||||
|
||||
pendingStatements = append(pendingStatements, stmt)
|
||||
log.Debugf("Job %d took %s", job.JobID, time.Since(s_job))
|
||||
}
|
||||
|
||||
t, err := jobRepo.TransactionInit()
|
||||
if err != nil {
|
||||
log.Errorf("failed TransactionInit %v", err)
|
||||
log.Errorf("skipped %d transactions for cluster %s", len(pendingStatements), cluster.Name)
|
||||
ce += len(pendingStatements)
|
||||
} else {
|
||||
for _, ps := range pendingStatements {
|
||||
query, args, err := ps.ToSql()
|
||||
if err != nil {
|
||||
log.Errorf("failed in ToSQL conversion: %v", err)
|
||||
ce++
|
||||
} else {
|
||||
// args...: Footprint-JSON, Energyfootprint-JSON, TotalEnergy, JobID
|
||||
jobRepo.TransactionAdd(t, query, args...)
|
||||
c++
|
||||
}
|
||||
}
|
||||
jobRepo.TransactionEnd(t)
|
||||
}
|
||||
log.Debugf("Finish Cluster %s, took %s", cluster.Name, time.Since(s_cluster))
|
||||
}
|
||||
log.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s", c, cl, ce, time.Since(s))
|
||||
}))
|
||||
}
|
||||
@@ -4,7 +4,13 @@
|
||||
// license that can be found in the LICENSE file.
|
||||
package util
|
||||
|
||||
import "golang.org/x/exp/constraints"
|
||||
import (
|
||||
"golang.org/x/exp/constraints"
|
||||
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
)
|
||||
|
||||
func Min[T constraints.Ordered](a, b T) T {
|
||||
if a < b {
|
||||
@@ -19,3 +25,36 @@ func Max[T constraints.Ordered](a, b T) T {
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// sortedCopy returns a new slice holding the elements of input in ascending
// order. The input slice itself is left unmodified.
func sortedCopy(input []float64) []float64 {
	out := append(make([]float64, 0, len(input)), input...)
	sort.Float64s(out)
	return out
}
|
||||
|
||||
// Mean returns the arithmetic mean of input. For an empty input it returns
// NaN together with an error.
func Mean(input []float64) (float64, error) {
	count := len(input)
	if count == 0 {
		return math.NaN(), fmt.Errorf("input array is empty: %#v", input)
	}

	var total float64
	for i := 0; i < count; i++ {
		total += input[i]
	}
	return total / float64(count), nil
}
|
||||
|
||||
func Median(input []float64) (median float64, err error) {
|
||||
c := sortedCopy(input)
|
||||
// Even numbers: add the two middle numbers, divide by two (use mean function)
|
||||
// Odd numbers: Use the middle number
|
||||
l := len(c)
|
||||
if l == 0 {
|
||||
return math.NaN(), fmt.Errorf("input array is empty: %#v", input)
|
||||
} else if l%2 == 0 {
|
||||
median, _ = Mean(c[l/2-1 : l/2+1])
|
||||
} else {
|
||||
median = c[l/2]
|
||||
}
|
||||
return median, nil
|
||||
}
|
||||
|
||||
@@ -7,13 +7,14 @@ package archive
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
const Version uint64 = 1
|
||||
const Version uint64 = 2
|
||||
|
||||
type ArchiveBackend interface {
|
||||
Init(rawConfig json.RawMessage) (uint64, error)
|
||||
@@ -53,40 +54,48 @@ type JobContainer struct {
|
||||
}
|
||||
|
||||
var (
|
||||
initOnce sync.Once
|
||||
cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
ar ArchiveBackend
|
||||
useArchive bool
|
||||
)
|
||||
|
||||
func Init(rawConfig json.RawMessage, disableArchive bool) error {
|
||||
useArchive = !disableArchive
|
||||
var err error
|
||||
|
||||
var cfg struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
initOnce.Do(func() {
|
||||
useArchive = !disableArchive
|
||||
|
||||
if err := json.Unmarshal(rawConfig, &cfg); err != nil {
|
||||
log.Warn("Error while unmarshaling raw config json")
|
||||
return err
|
||||
}
|
||||
var cfg struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
|
||||
switch cfg.Kind {
|
||||
case "file":
|
||||
ar = &FsArchive{}
|
||||
// case "s3":
|
||||
// ar = &S3Archive{}
|
||||
default:
|
||||
return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
|
||||
}
|
||||
if err = json.Unmarshal(rawConfig, &cfg); err != nil {
|
||||
log.Warn("Error while unmarshaling raw config json")
|
||||
return
|
||||
}
|
||||
|
||||
version, err := ar.Init(rawConfig)
|
||||
if err != nil {
|
||||
log.Error("Error while initializing archiveBackend")
|
||||
return err
|
||||
}
|
||||
log.Infof("Load archive version %d", version)
|
||||
switch cfg.Kind {
|
||||
case "file":
|
||||
ar = &FsArchive{}
|
||||
// case "s3":
|
||||
// ar = &S3Archive{}
|
||||
default:
|
||||
err = fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
|
||||
}
|
||||
|
||||
return initClusterConfig()
|
||||
var version uint64
|
||||
version, err = ar.Init(rawConfig)
|
||||
if err != nil {
|
||||
log.Error("Error while initializing archiveBackend")
|
||||
return
|
||||
}
|
||||
log.Infof("Load archive version %d", version)
|
||||
|
||||
err = initClusterConfig()
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func GetHandle() ArchiveBackend {
|
||||
@@ -162,8 +171,9 @@ func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
|
||||
jobMeta.Tags = make([]*schema.Tag, 0)
|
||||
for _, tag := range tags {
|
||||
jobMeta.Tags = append(jobMeta.Tags, &schema.Tag{
|
||||
Name: tag.Name,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
Type: tag.Type,
|
||||
Scope: tag.Scope,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -12,13 +12,16 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var Clusters []*schema.Cluster
|
||||
var nodeLists map[string]map[string]NodeList
|
||||
var (
|
||||
Clusters []*schema.Cluster
|
||||
GlobalMetricList []*schema.GlobalMetricListItem
|
||||
NodeLists map[string]map[string]NodeList
|
||||
)
|
||||
|
||||
func initClusterConfig() error {
|
||||
|
||||
Clusters = []*schema.Cluster{}
|
||||
nodeLists = map[string]map[string]NodeList{}
|
||||
NodeLists = map[string]map[string]NodeList{}
|
||||
metricLookup := make(map[string]schema.GlobalMetricListItem)
|
||||
|
||||
for _, c := range ar.GetClusters() {
|
||||
|
||||
@@ -49,11 +52,64 @@ func initClusterConfig() error {
|
||||
if !mc.Scope.Valid() {
|
||||
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
|
||||
}
|
||||
|
||||
ml, ok := metricLookup[mc.Name]
|
||||
if !ok {
|
||||
metricLookup[mc.Name] = schema.GlobalMetricListItem{
|
||||
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
|
||||
}
|
||||
ml = metricLookup[mc.Name]
|
||||
}
|
||||
availability := schema.ClusterSupport{Cluster: cluster.Name}
|
||||
scLookup := make(map[string]*schema.SubClusterConfig)
|
||||
|
||||
for _, scc := range mc.SubClusters {
|
||||
scLookup[scc.Name] = scc
|
||||
}
|
||||
|
||||
for _, sc := range cluster.SubClusters {
|
||||
newMetric := mc
|
||||
newMetric.SubClusters = nil
|
||||
|
||||
if cfg, ok := scLookup[sc.Name]; ok {
|
||||
if !cfg.Remove {
|
||||
availability.SubClusters = append(availability.SubClusters, sc.Name)
|
||||
newMetric.Peak = cfg.Peak
|
||||
newMetric.Normal = cfg.Normal
|
||||
newMetric.Caution = cfg.Caution
|
||||
newMetric.Alert = cfg.Alert
|
||||
newMetric.Footprint = cfg.Footprint
|
||||
newMetric.Energy = cfg.Energy
|
||||
newMetric.LowerIsBetter = cfg.LowerIsBetter
|
||||
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
|
||||
|
||||
if newMetric.Footprint != "" {
|
||||
sc.Footprint = append(sc.Footprint, newMetric.Name)
|
||||
ml.Footprint = newMetric.Footprint
|
||||
}
|
||||
if newMetric.Energy != "" {
|
||||
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
availability.SubClusters = append(availability.SubClusters, sc.Name)
|
||||
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
|
||||
|
||||
if newMetric.Footprint != "" {
|
||||
sc.Footprint = append(sc.Footprint, newMetric.Name)
|
||||
}
|
||||
if newMetric.Energy != "" {
|
||||
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
|
||||
metricLookup[mc.Name] = ml
|
||||
}
|
||||
|
||||
Clusters = append(Clusters, cluster)
|
||||
|
||||
nodeLists[cluster.Name] = make(map[string]NodeList)
|
||||
NodeLists[cluster.Name] = make(map[string]NodeList)
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Nodes == "*" {
|
||||
continue
|
||||
@@ -63,15 +119,18 @@ func initClusterConfig() error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err)
|
||||
}
|
||||
nodeLists[cluster.Name][sc.Name] = nl
|
||||
NodeLists[cluster.Name][sc.Name] = nl
|
||||
}
|
||||
}
|
||||
|
||||
for _, ml := range metricLookup {
|
||||
GlobalMetricList = append(GlobalMetricList, &ml)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetCluster(cluster string) *schema.Cluster {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
return c
|
||||
@@ -90,11 +149,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
|
||||
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
|
||||
}
|
||||
|
||||
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, m := range c.MetricConfig {
|
||||
@@ -110,7 +168,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
|
||||
// AssignSubCluster sets the `job.subcluster` property of the job based
|
||||
// on its cluster and resources.
|
||||
func AssignSubCluster(job *schema.BaseJob) error {
|
||||
|
||||
cluster := GetCluster(job.Cluster)
|
||||
if cluster == nil {
|
||||
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster)
|
||||
@@ -130,7 +187,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
|
||||
}
|
||||
|
||||
host0 := job.Resources[0].Hostname
|
||||
for sc, nl := range nodeLists[job.Cluster] {
|
||||
for sc, nl := range NodeLists[job.Cluster] {
|
||||
if nl != nil && nl.Contains(host0) {
|
||||
job.SubCluster = sc
|
||||
return nil
|
||||
@@ -146,8 +203,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
|
||||
}
|
||||
|
||||
func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
||||
|
||||
for sc, nl := range nodeLists[cluster] {
|
||||
for sc, nl := range NodeLists[cluster] {
|
||||
if nl != nil && nl.Contains(hostname) {
|
||||
return sc, nil
|
||||
}
|
||||
@@ -164,3 +220,13 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
||||
|
||||
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname)
|
||||
}
|
||||
|
||||
func MetricIndex(mc []schema.MetricConfig, name string) (int, error) {
|
||||
for i, m := range mc {
|
||||
if m.Name == name {
|
||||
return i, nil
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("unknown metric name %s", name)
|
||||
}
|
||||
|
||||
39
pkg/archive/clusterConfig_test.go
Normal file
39
pkg/archive/clusterConfig_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package archive_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
)
|
||||
|
||||
func TestClusterConfig(t *testing.T) {
|
||||
if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}"), false); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
sc, err := archive.GetSubCluster("fritz", "spr1tb")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// spew.Dump(sc.MetricConfig)
|
||||
if len(sc.Footprint) != 3 {
|
||||
t.Fail()
|
||||
}
|
||||
if len(sc.MetricConfig) != 15 {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
for _, metric := range sc.MetricConfig {
|
||||
if metric.LowerIsBetter && metric.Name != "mem_used" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
// spew.Dump(archive.GlobalMetricList)
|
||||
// t.Fail()
|
||||
}
|
||||
@@ -30,6 +30,7 @@ func TestInitNoJson(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitNotExists(t *testing.T) {
|
||||
var fsa FsArchive
|
||||
_, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/job-archive\"}"))
|
||||
@@ -47,10 +48,10 @@ func TestInit(t *testing.T) {
|
||||
if fsa.path != "testdata/archive" {
|
||||
t.Fail()
|
||||
}
|
||||
if version != 1 {
|
||||
if version != 2 {
|
||||
t.Fail()
|
||||
}
|
||||
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
|
||||
if len(fsa.clusters) != 3 || fsa.clusters[1] != "emmy" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
@@ -133,7 +134,6 @@ func TestLoadJobData(t *testing.T) {
|
||||
}
|
||||
|
||||
func BenchmarkLoadJobData(b *testing.B) {
|
||||
|
||||
tmpdir := b.TempDir()
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
util.CopyDir("./testdata/archive/", jobarchive)
|
||||
@@ -157,7 +157,6 @@ func BenchmarkLoadJobData(b *testing.B) {
|
||||
}
|
||||
|
||||
func BenchmarkLoadJobDataCompressed(b *testing.B) {
|
||||
|
||||
tmpdir := b.TempDir()
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
util.CopyDir("./testdata/archive/", jobarchive)
|
||||
|
||||
@@ -9,8 +9,8 @@ import (
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
|
||||
|
||||
2772
pkg/archive/testdata/archive/alex/cluster.json
vendored
Normal file
2772
pkg/archive/testdata/archive/alex/cluster.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2246
pkg/archive/testdata/archive/fritz/cluster.json
vendored
Normal file
2246
pkg/archive/testdata/archive/fritz/cluster.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2
pkg/archive/testdata/archive/version.txt
vendored
2
pkg/archive/testdata/archive/version.txt
vendored
@@ -1 +1 @@
|
||||
1
|
||||
2
|
||||
|
||||
123
pkg/resampler/resampler.go
Normal file
123
pkg/resampler/resampler.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package resampler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
func SimpleResampler(data []schema.Float, old_frequency int64, new_frequency int64) ([]schema.Float, int64, error) {
|
||||
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
|
||||
return data, old_frequency, nil
|
||||
}
|
||||
|
||||
if new_frequency%old_frequency != 0 {
|
||||
return nil, 0, errors.New("new sampling frequency should be multiple of the old frequency")
|
||||
}
|
||||
|
||||
var step int = int(new_frequency / old_frequency)
|
||||
var new_data_length = len(data) / step
|
||||
|
||||
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
|
||||
return data, old_frequency, nil
|
||||
}
|
||||
|
||||
new_data := make([]schema.Float, new_data_length)
|
||||
|
||||
for i := 0; i < new_data_length; i++ {
|
||||
new_data[i] = data[i*step]
|
||||
}
|
||||
|
||||
return new_data, new_frequency, nil
|
||||
}
|
||||
|
||||
// Inspired by one of the algorithms from https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf
|
||||
// Adapted from https://github.com/haoel/downsampling/blob/master/core/lttb.go
|
||||
func LargestTriangleThreeBucket(data []schema.Float, old_frequency int, new_frequency int) ([]schema.Float, int, error) {
|
||||
|
||||
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
|
||||
return data, old_frequency, nil
|
||||
}
|
||||
|
||||
if new_frequency%old_frequency != 0 {
|
||||
return nil, 0, errors.New(fmt.Sprintf("new sampling frequency : %d should be multiple of the old frequency : %d", new_frequency, old_frequency))
|
||||
}
|
||||
|
||||
var step int = int(new_frequency / old_frequency)
|
||||
var new_data_length = len(data) / step
|
||||
|
||||
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
|
||||
return data, old_frequency, nil
|
||||
}
|
||||
|
||||
new_data := make([]schema.Float, 0, new_data_length)
|
||||
|
||||
// Bucket size. Leave room for start and end data points
|
||||
bucketSize := float64(len(data)-2) / float64(new_data_length-2)
|
||||
|
||||
new_data = append(new_data, data[0]) // Always add the first point
|
||||
|
||||
// We have 3 pointers represent for
|
||||
// > bucketLow - the current bucket's beginning location
|
||||
// > bucketMiddle - the current bucket's ending location,
|
||||
// also the beginning location of next bucket
|
||||
// > bucketHight - the next bucket's ending location.
|
||||
bucketLow := 1
|
||||
bucketMiddle := int(math.Floor(bucketSize)) + 1
|
||||
|
||||
var prevMaxAreaPoint int
|
||||
|
||||
for i := 0; i < new_data_length-2; i++ {
|
||||
|
||||
bucketHigh := int(math.Floor(float64(i+2)*bucketSize)) + 1
|
||||
if bucketHigh >= len(data)-1 {
|
||||
bucketHigh = len(data) - 2
|
||||
}
|
||||
|
||||
// Calculate point average for next bucket (containing c)
|
||||
avgPointX, avgPointY := calculateAverageDataPoint(data[bucketMiddle:bucketHigh+1], int64(bucketMiddle))
|
||||
|
||||
// Get the range for current bucket
|
||||
currBucketStart := bucketLow
|
||||
currBucketEnd := bucketMiddle
|
||||
|
||||
// Point a
|
||||
pointX := prevMaxAreaPoint
|
||||
pointY := data[prevMaxAreaPoint]
|
||||
|
||||
maxArea := -1.0
|
||||
|
||||
var maxAreaPoint int
|
||||
flag_ := 0
|
||||
for ; currBucketStart < currBucketEnd; currBucketStart++ {
|
||||
|
||||
area := calculateTriangleArea(schema.Float(pointX), pointY, avgPointX, avgPointY, schema.Float(currBucketStart), data[currBucketStart])
|
||||
if area > maxArea {
|
||||
maxArea = area
|
||||
maxAreaPoint = currBucketStart
|
||||
}
|
||||
if math.IsNaN(float64(avgPointY)) {
|
||||
flag_ = 1
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if flag_ == 1 {
|
||||
new_data = append(new_data, schema.NaN) // Pick this point from the bucket
|
||||
|
||||
} else {
|
||||
new_data = append(new_data, data[maxAreaPoint]) // Pick this point from the bucket
|
||||
}
|
||||
prevMaxAreaPoint = maxAreaPoint // This MaxArea point is the next's prevMAxAreaPoint
|
||||
|
||||
//move to the next window
|
||||
bucketLow = bucketMiddle
|
||||
bucketMiddle = bucketHigh
|
||||
}
|
||||
|
||||
new_data = append(new_data, data[len(data)-1]) // Always add last
|
||||
|
||||
return new_data, new_frequency, nil
|
||||
}
|
||||
35
pkg/resampler/util.go
Normal file
35
pkg/resampler/util.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package resampler
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
func calculateTriangleArea(paX, paY, pbX, pbY, pcX, pcY schema.Float) float64 {
|
||||
area := ((paX-pcX)*(pbY-paY) - (paX-pbX)*(pcY-paY)) * 0.5
|
||||
return math.Abs(float64(area))
|
||||
}
|
||||
|
||||
func calculateAverageDataPoint(points []schema.Float, xStart int64) (avgX schema.Float, avgY schema.Float) {
|
||||
flag := 0
|
||||
for _, point := range points {
|
||||
avgX += schema.Float(xStart)
|
||||
avgY += point
|
||||
xStart++
|
||||
if math.IsNaN(float64(point)) {
|
||||
flag = 1
|
||||
}
|
||||
}
|
||||
|
||||
l := schema.Float(len(points))
|
||||
|
||||
avgX /= l
|
||||
avgY /= l
|
||||
|
||||
if flag == 1 {
|
||||
return avgX, schema.NaN
|
||||
} else {
|
||||
return avgX, avgY
|
||||
}
|
||||
}
|
||||
@@ -30,38 +30,47 @@ type MetricValue struct {
|
||||
}
|
||||
|
||||
type SubCluster struct {
|
||||
Name string `json:"name"`
|
||||
Nodes string `json:"nodes"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar MetricValue `json:"flopRateScalar"`
|
||||
FlopRateSimd MetricValue `json:"flopRateSimd"`
|
||||
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
|
||||
Topology Topology `json:"topology"`
|
||||
Name string `json:"name"`
|
||||
Nodes string `json:"nodes"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
Topology Topology `json:"topology"`
|
||||
FlopRateScalar MetricValue `json:"flopRateScalar"`
|
||||
FlopRateSimd MetricValue `json:"flopRateSimd"`
|
||||
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
|
||||
MetricConfig []MetricConfig `json:"metricConfig,omitempty"`
|
||||
Footprint []string `json:"footprint,omitempty"`
|
||||
EnergyFootprint []string `json:"energyFootprint,omitempty"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
}
|
||||
|
||||
type SubClusterConfig struct {
|
||||
Name string `json:"name"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
Remove bool `json:"remove"`
|
||||
Name string `json:"name"`
|
||||
Footprint string `json:"footprint,omitempty"`
|
||||
Energy string `json:"energy"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
Remove bool `json:"remove"`
|
||||
LowerIsBetter bool `json:"lowerIsBetter"`
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
Name string `json:"name"`
|
||||
Unit Unit `json:"unit"`
|
||||
Scope MetricScope `json:"scope"`
|
||||
Aggregation string `json:"aggregation"`
|
||||
Timestep int `json:"timestep"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
|
||||
Unit Unit `json:"unit"`
|
||||
Energy string `json:"energy"`
|
||||
Name string `json:"name"`
|
||||
Scope MetricScope `json:"scope"`
|
||||
Aggregation string `json:"aggregation"`
|
||||
Footprint string `json:"footprint,omitempty"`
|
||||
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
|
||||
Peak float64 `json:"peak"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
Timestep int `json:"timestep"`
|
||||
Normal float64 `json:"normal"`
|
||||
LowerIsBetter bool `json:"lowerIsBetter"`
|
||||
}
|
||||
|
||||
type Cluster struct {
|
||||
@@ -70,14 +79,27 @@ type Cluster struct {
|
||||
SubClusters []*SubCluster `json:"subClusters"`
|
||||
}
|
||||
|
||||
// ClusterSupport pairs a cluster name with a list of subcluster names.
type ClusterSupport struct {
	// Cluster is serialized as "cluster".
	Cluster string `json:"cluster"`
	// SubClusters is serialized as "subclusters".
	SubClusters []string `json:"subclusters"`
}
|
||||
|
||||
type GlobalMetricListItem struct {
|
||||
Name string `json:"name"`
|
||||
Unit Unit `json:"unit"`
|
||||
Scope MetricScope `json:"scope"`
|
||||
Footprint string `json:"footprint,omitempty"`
|
||||
Availability []ClusterSupport `json:"availability"`
|
||||
}
|
||||
|
||||
// Return a list of socket IDs given a list of hwthread IDs. Even if just one
|
||||
// hwthread is in that socket, add it to the list. If no hwthreads other than
|
||||
// those in the argument list are assigned to one of the sockets in the first
|
||||
// return value, return true as the second value. TODO: Optimize this, there
|
||||
// must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetSocketsFromHWThreads(
|
||||
hwthreads []int) (sockets []int, exclusive bool) {
|
||||
|
||||
hwthreads []int,
|
||||
) (sockets []int, exclusive bool) {
|
||||
socketsMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for socket, hwthreadsInSocket := range topo.Socket {
|
||||
@@ -100,14 +122,46 @@ func (topo *Topology) GetSocketsFromHWThreads(
|
||||
return sockets, exclusive
|
||||
}
|
||||
|
||||
// Return a list of socket IDs given a list of core IDs. Even if just one
|
||||
// core is in that socket, add it to the list. If no cores other than
|
||||
// those in the argument list are assigned to one of the sockets in the first
|
||||
// return value, return true as the second value. TODO: Optimize this, there
|
||||
// must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetSocketsFromCores (
|
||||
cores []int,
|
||||
) (sockets []int, exclusive bool) {
|
||||
socketsMap := map[int]int{}
|
||||
for _, core := range cores {
|
||||
for _, hwthreadInCore := range topo.Core[core] {
|
||||
for socket, hwthreadsInSocket := range topo.Socket {
|
||||
for _, hwthreadInSocket := range hwthreadsInSocket {
|
||||
if hwthreadInCore == hwthreadInSocket {
|
||||
socketsMap[socket] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exclusive = true
|
||||
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
|
||||
sockets = make([]int, 0, len(socketsMap))
|
||||
for socket, count := range socketsMap {
|
||||
sockets = append(sockets, socket)
|
||||
exclusive = exclusive && count == hwthreadsPerSocket
|
||||
}
|
||||
|
||||
return sockets, exclusive
|
||||
}
|
||||
|
||||
// Return a list of core IDs given a list of hwthread IDs. Even if just one
|
||||
// hwthread is in that core, add it to the list. If no hwthreads other than
|
||||
// those in the argument list are assigned to one of the cores in the first
|
||||
// return value, return true as the second value. TODO: Optimize this, there
|
||||
// must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetCoresFromHWThreads(
|
||||
hwthreads []int) (cores []int, exclusive bool) {
|
||||
|
||||
hwthreads []int,
|
||||
) (cores []int, exclusive bool) {
|
||||
coresMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for core, hwthreadsInCore := range topo.Core {
|
||||
@@ -136,8 +190,8 @@ func (topo *Topology) GetCoresFromHWThreads(
|
||||
// memory domains in the first return value, return true as the second value.
|
||||
// TODO: Optimize this, there must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetMemoryDomainsFromHWThreads(
|
||||
hwthreads []int) (memDoms []int, exclusive bool) {
|
||||
|
||||
hwthreads []int,
|
||||
) (memDoms []int, exclusive bool) {
|
||||
memDomsMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for memDom, hwthreadsInmemDom := range topo.MemoryDomain {
|
||||
@@ -172,7 +226,17 @@ func (topo *Topology) GetAcceleratorID(id int) (string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
|
||||
// Return list of hardware (string) accelerator IDs
|
||||
func (topo *Topology) GetAcceleratorIDs() []string {
|
||||
accels := make([]string, 0)
|
||||
for _, accel := range topo.Accelerators {
|
||||
accels = append(accels, accel.ID)
|
||||
}
|
||||
return accels
|
||||
}
|
||||
|
||||
// Outdated? Or: Return indices of accelerators in parent array?
|
||||
func (topo *Topology) GetAcceleratorIDsAsInt() ([]int, error) {
|
||||
accels := make([]int, 0)
|
||||
for _, accel := range topo.Accelerators {
|
||||
id, err := strconv.Atoi(accel.ID)
|
||||
|
||||
@@ -24,8 +24,9 @@ type LdapConfig struct {
|
||||
}
|
||||
|
||||
type OpenIDConfig struct {
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type JWTAuthConfig struct {
|
||||
@@ -45,6 +46,9 @@ type JWTAuthConfig struct {
|
||||
|
||||
// Should a non-existent user be added to the DB based on the information in the token
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
|
||||
// Should an existent user be updated in the DB based on the information in the token
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
}
|
||||
|
||||
type IntRange struct {
|
||||
@@ -53,8 +57,9 @@ type IntRange struct {
|
||||
}
|
||||
|
||||
type TimeRange struct {
|
||||
From *time.Time `json:"from"`
|
||||
To *time.Time `json:"to"`
|
||||
From *time.Time `json:"from"`
|
||||
To *time.Time `json:"to"`
|
||||
Range string `json:"range,omitempty"`
|
||||
}
|
||||
|
||||
type FilterRanges struct {
|
||||
@@ -76,6 +81,20 @@ type Retention struct {
|
||||
IncludeDB bool `json:"includeDB"`
|
||||
}
|
||||
|
||||
// ResampleConfig holds the settings for resampling of metric data.
type ResampleConfig struct {
	// Target resolutions for resampling, given in seconds, e.g. [600,300,60].
	Resolutions []int `json:"resolutions"`
	// The next zoom level is triggered when fewer than this many datapoints
	// are visible.
	Trigger int `json:"trigger"`
}
|
||||
|
||||
// CronFrequency configures how often the periodic update workers run.
type CronFrequency struct {
	// Interval of the duration update worker [defaults to '5m'].
	DurationWorker string `json:"duration-worker"`
	// Interval of the metric-footprint update worker [defaults to '10m'].
	FootprintWorker string `json:"footprint-worker"`
}
|
||||
|
||||
// Format of the configuration (file). See below for the defaults.
|
||||
type ProgramConfig struct {
|
||||
// Address where the http (or https) server will listen on (for example: 'localhost:80').
|
||||
@@ -133,6 +152,9 @@ type ProgramConfig struct {
|
||||
// be provided! Most options here can be overwritten by the user.
|
||||
UiDefaults map[string]interface{} `json:"ui-defaults"`
|
||||
|
||||
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
|
||||
EnableResampling *ResampleConfig `json:"enable-resampling"`
|
||||
|
||||
// Where to store MachineState files
|
||||
MachineStateDir string `json:"machine-state-dir"`
|
||||
|
||||
@@ -142,6 +164,13 @@ type ProgramConfig struct {
|
||||
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
|
||||
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
|
||||
|
||||
// Energy Mix CO2 Emission Constant [g/kWh]
|
||||
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
|
||||
EmissionConstant int `json:"emission-constant"`
|
||||
|
||||
// Frequency of cron job workers
|
||||
CronFrequency *CronFrequency `json:"cron-frequency"`
|
||||
|
||||
// Array of Clusters
|
||||
Clusters []*ClusterConfig `json:"clusters"`
|
||||
}
|
||||
|
||||
@@ -16,30 +16,33 @@ import (
|
||||
// Common subset of Job and JobMeta. Use one of those, not this type directly.
|
||||
|
||||
type BaseJob struct {
|
||||
// The unique identifier of a job
|
||||
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
|
||||
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
|
||||
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
|
||||
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
|
||||
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
|
||||
Partition string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
|
||||
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
|
||||
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
|
||||
// NumCores int32 `json:"numCores" db:"num_cores" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
|
||||
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
|
||||
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
|
||||
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
|
||||
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
|
||||
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
|
||||
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
|
||||
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
|
||||
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
|
||||
Tags []*Tag `json:"tags,omitempty"` // List of tags
|
||||
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
|
||||
Resources []*Resource `json:"resources"` // Resources used by job
|
||||
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
|
||||
MetaData map[string]string `json:"metaData"` // Additional information about the job
|
||||
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
|
||||
Cluster string `json:"cluster" db:"cluster" example:"fritz"`
|
||||
SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
|
||||
Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"`
|
||||
Project string `json:"project" db:"project" example:"abcd200"`
|
||||
User string `json:"user" db:"hpc_user" example:"abcd100h"`
|
||||
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
|
||||
Tags []*Tag `json:"tags,omitempty"`
|
||||
RawEnergyFootprint []byte `json:"-" db:"energy_footprint"`
|
||||
RawFootprint []byte `json:"-" db:"footprint"`
|
||||
RawMetaData []byte `json:"-" db:"meta_data"`
|
||||
RawResources []byte `json:"-" db:"resources"`
|
||||
Resources []*Resource `json:"resources"`
|
||||
EnergyFootprint map[string]float64 `json:"energyFootprint"`
|
||||
Footprint map[string]float64 `json:"footprint"`
|
||||
MetaData map[string]string `json:"metaData"`
|
||||
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
|
||||
Energy float64 `json:"energy" db:"energy"`
|
||||
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
|
||||
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
|
||||
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
|
||||
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"`
|
||||
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"`
|
||||
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
|
||||
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
|
||||
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
|
||||
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
|
||||
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
|
||||
}
|
||||
|
||||
// Job struct type
|
||||
@@ -49,19 +52,10 @@ type BaseJob struct {
|
||||
// Job model
|
||||
// @Description Information of a HPC job.
|
||||
type Job struct {
|
||||
// The unique identifier of a job in the database
|
||||
ID int64 `json:"id" db:"id"`
|
||||
StartTime time.Time `json:"startTime"`
|
||||
BaseJob
|
||||
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
|
||||
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
|
||||
MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64
|
||||
FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64
|
||||
MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64
|
||||
LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64
|
||||
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
|
||||
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
|
||||
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
|
||||
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
|
||||
ID int64 `json:"id" db:"id"`
|
||||
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"`
|
||||
}
|
||||
|
||||
// JobMeta struct type
|
||||
@@ -88,11 +82,10 @@ type JobLinkResultList struct {
|
||||
// JobMeta model
|
||||
// @Description Meta data information of a HPC job.
|
||||
type JobMeta struct {
|
||||
// The unique identifier of a job in the database
|
||||
ID *int64 `json:"id,omitempty"`
|
||||
ID *int64 `json:"id,omitempty"`
|
||||
Statistics map[string]JobStatistics `json:"statistics"`
|
||||
BaseJob
|
||||
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
|
||||
Statistics map[string]JobStatistics `json:"statistics"` // Metric statistics of job
|
||||
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
|
||||
}
|
||||
|
||||
const (
|
||||
@@ -124,18 +117,19 @@ type JobStatistics struct {
|
||||
// Tag model
|
||||
// @Description Defines a tag using name and type.
|
||||
type Tag struct {
|
||||
ID int64 `json:"id" db:"id"` // The unique DB identifier of a tag
|
||||
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
|
||||
Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
|
||||
Type string `json:"type" db:"tag_type" example:"Debug"`
|
||||
Name string `json:"name" db:"tag_name" example:"Testjob"`
|
||||
Scope string `json:"scope" db:"tag_scope" example:"global"`
|
||||
ID int64 `json:"id" db:"id"`
|
||||
}
|
||||
|
||||
// Resource model
|
||||
// @Description A resource used by a job
|
||||
type Resource struct {
|
||||
Hostname string `json:"hostname"` // Name of the host (= node)
|
||||
HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids
|
||||
Accelerators []string `json:"accelerators,omitempty"` // List of of accelerator device ids
|
||||
Configuration string `json:"configuration,omitempty"` // The configuration options of the node
|
||||
Hostname string `json:"hostname"`
|
||||
Configuration string `json:"configuration,omitempty"`
|
||||
HWThreads []int `json:"hwthreads,omitempty"`
|
||||
Accelerators []string `json:"accelerators,omitempty"`
|
||||
}
|
||||
|
||||
type JobState string
|
||||
|
||||
@@ -10,22 +10,24 @@ import (
|
||||
"math"
|
||||
"sort"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/util"
|
||||
)
|
||||
|
||||
type JobData map[string]map[MetricScope]*JobMetric
|
||||
|
||||
type JobMetric struct {
|
||||
Unit Unit `json:"unit"`
|
||||
Timestep int `json:"timestep"`
|
||||
Series []Series `json:"series"`
|
||||
StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
|
||||
Unit Unit `json:"unit"`
|
||||
Series []Series `json:"series"`
|
||||
Timestep int `json:"timestep"`
|
||||
}
|
||||
|
||||
type Series struct {
|
||||
Hostname string `json:"hostname"`
|
||||
Id *string `json:"id,omitempty"`
|
||||
Statistics MetricStatistics `json:"statistics"`
|
||||
Hostname string `json:"hostname"`
|
||||
Data []Float `json:"data"`
|
||||
Statistics MetricStatistics `json:"statistics"`
|
||||
}
|
||||
|
||||
type MetricStatistics struct {
|
||||
@@ -35,10 +37,11 @@ type MetricStatistics struct {
|
||||
}
|
||||
|
||||
type StatsSeries struct {
|
||||
Percentiles map[int][]Float `json:"percentiles,omitempty"`
|
||||
Mean []Float `json:"mean"`
|
||||
Median []Float `json:"median"`
|
||||
Min []Float `json:"min"`
|
||||
Max []Float `json:"max"`
|
||||
Percentiles map[int][]Float `json:"percentiles,omitempty"`
|
||||
}
|
||||
|
||||
type MetricScope string
|
||||
@@ -121,6 +124,7 @@ func (jd *JobData) Size() int {
|
||||
if metric.StatisticsSeries != nil {
|
||||
n += len(metric.StatisticsSeries.Max)
|
||||
n += len(metric.StatisticsSeries.Mean)
|
||||
n += len(metric.StatisticsSeries.Median)
|
||||
n += len(metric.StatisticsSeries.Min)
|
||||
}
|
||||
|
||||
@@ -149,53 +153,74 @@ func (jm *JobMetric) AddStatisticsSeries() {
|
||||
}
|
||||
}
|
||||
|
||||
min, mean, max := make([]Float, n), make([]Float, n), make([]Float, n)
|
||||
// mean := make([]Float, n)
|
||||
min, median, max := make([]Float, n), make([]Float, n), make([]Float, n)
|
||||
i := 0
|
||||
for ; i < m; i++ {
|
||||
smin, ssum, smax := math.MaxFloat32, 0.0, -math.MaxFloat32
|
||||
seriesCount := len(jm.Series)
|
||||
// ssum := 0.0
|
||||
smin, smed, smax := math.MaxFloat32, make([]float64, seriesCount), -math.MaxFloat32
|
||||
notnan := 0
|
||||
for j := 0; j < len(jm.Series); j++ {
|
||||
for j := 0; j < seriesCount; j++ {
|
||||
x := float64(jm.Series[j].Data[i])
|
||||
if math.IsNaN(x) {
|
||||
continue
|
||||
}
|
||||
|
||||
notnan += 1
|
||||
ssum += x
|
||||
// ssum += x
|
||||
smed[j] = x
|
||||
smin = math.Min(smin, x)
|
||||
smax = math.Max(smax, x)
|
||||
}
|
||||
|
||||
if notnan < 3 {
|
||||
min[i] = NaN
|
||||
mean[i] = NaN
|
||||
// mean[i] = NaN
|
||||
median[i] = NaN
|
||||
max[i] = NaN
|
||||
} else {
|
||||
min[i] = Float(smin)
|
||||
mean[i] = Float(ssum / float64(notnan))
|
||||
// mean[i] = Float(ssum / float64(notnan))
|
||||
max[i] = Float(smax)
|
||||
|
||||
medianRaw, err := util.Median(smed)
|
||||
if err != nil {
|
||||
median[i] = NaN
|
||||
} else {
|
||||
median[i] = Float(medianRaw)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < n; i++ {
|
||||
min[i] = NaN
|
||||
mean[i] = NaN
|
||||
// mean[i] = NaN
|
||||
median[i] = NaN
|
||||
max[i] = NaN
|
||||
}
|
||||
|
||||
if smooth {
|
||||
for i := 2; i < len(mean)-2; i++ {
|
||||
for i := 2; i < len(median)-2; i++ {
|
||||
if min[i].IsNaN() {
|
||||
continue
|
||||
}
|
||||
|
||||
min[i] = (min[i-2] + min[i-1] + min[i] + min[i+1] + min[i+2]) / 5
|
||||
max[i] = (max[i-2] + max[i-1] + max[i] + max[i+1] + max[i+2]) / 5
|
||||
mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
|
||||
// mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
|
||||
// Reduce Median further
|
||||
smoothRaw := []float64{float64(median[i-2]), float64(median[i-1]), float64(median[i]), float64(median[i+1]), float64(median[i+2])}
|
||||
smoothMedian, err := util.Median(smoothRaw)
|
||||
if err != nil {
|
||||
median[i] = NaN
|
||||
} else {
|
||||
median[i] = Float(smoothMedian)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
jm.StatisticsSeries = &StatsSeries{Mean: mean, Min: min, Max: max}
|
||||
jm.StatisticsSeries = &StatsSeries{Median: median, Min: min, Max: max} // Mean: mean
|
||||
}
|
||||
|
||||
func (jd *JobData) AddNodeScope(metric string) bool {
|
||||
@@ -204,7 +229,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
var maxScope MetricScope = MetricScopeInvalid
|
||||
maxScope := MetricScopeInvalid
|
||||
for scope := range scopes {
|
||||
maxScope = maxScope.Max(scope)
|
||||
}
|
||||
@@ -266,6 +291,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (jd *JobData) RoundMetricStats() {
|
||||
// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
|
||||
for _, scopes := range *jd {
|
||||
for _, jm := range scopes {
|
||||
for index := range jm.Series {
|
||||
jm.Series[index].Statistics = MetricStatistics{
|
||||
Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
|
||||
Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
|
||||
Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (jm *JobMetric) AddPercentiles(ps []int) bool {
|
||||
if jm.StatisticsSeries == nil {
|
||||
jm.AddStatisticsSeries()
|
||||
|
||||
@@ -1,284 +1,327 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://cluster.schema.json",
|
||||
"title": "HPC cluster description",
|
||||
"description": "Meta data information of a HPC cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"metricConfig": {
|
||||
"description": "Metric specifications",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Metric name",
|
||||
"type": "string"
|
||||
},
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"scope": {
|
||||
"description": "Native measurement resolution",
|
||||
"type": "string"
|
||||
},
|
||||
"timestep": {
|
||||
"description": "Frequency of timeseries points",
|
||||
"type": "integer"
|
||||
},
|
||||
"aggregation": {
|
||||
"description": "How the metric is aggregated",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"sum",
|
||||
"avg"
|
||||
]
|
||||
},
|
||||
"peak": {
|
||||
"description": "Metric peak threshold (Upper metric limit)",
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"description": "Metric normal threshold",
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"description": "Metric alert threshold (Requires immediate action)",
|
||||
"type": "number"
|
||||
},
|
||||
"subClusters": {
|
||||
"description": "Array of cluster hardware partition metric thresholds",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Hardware partition name",
|
||||
"type": "string"
|
||||
},
|
||||
"peak": {
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"type": "number"
|
||||
},
|
||||
"remove": {
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"unit",
|
||||
"scope",
|
||||
"timestep",
|
||||
"aggregation",
|
||||
"peak",
|
||||
"normal",
|
||||
"caution",
|
||||
"alert"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
},
|
||||
"subClusters": {
|
||||
"description": "Array of cluster hardware partitions",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Hardware partition name",
|
||||
"type": "string"
|
||||
},
|
||||
"processorType": {
|
||||
"description": "Processor type",
|
||||
"type": "string"
|
||||
},
|
||||
"socketsPerNode": {
|
||||
"description": "Number of sockets per node",
|
||||
"type": "integer"
|
||||
},
|
||||
"coresPerSocket": {
|
||||
"description": "Number of cores per socket",
|
||||
"type": "integer"
|
||||
},
|
||||
"threadsPerCore": {
|
||||
"description": "Number of SMT threads per core",
|
||||
"type": "integer"
|
||||
},
|
||||
"flopRateScalar": {
|
||||
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"description": "Theoretical node peak memory bandwidth in GB/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nodes": {
|
||||
"description": "Node list expression",
|
||||
"type": "string"
|
||||
},
|
||||
"topology": {
|
||||
"description": "Node topology",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"description": "HwThread lists of node",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"socket": {
|
||||
"description": "HwThread lists of sockets",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memoryDomain": {
|
||||
"description": "HwThread lists of memory domains",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"die": {
|
||||
"description": "HwThread lists of dies",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"core": {
|
||||
"description": "HwThread lists of cores",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"accelerators": {
|
||||
"type": "array",
|
||||
"description": "List of accelerator devices",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "The unique device id"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": "The accelerator type",
|
||||
"enum": [
|
||||
"Nvidia GPU",
|
||||
"AMD GPU",
|
||||
"Intel GPU"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "The accelerator model"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"type",
|
||||
"model"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node",
|
||||
"socket",
|
||||
"memoryDomain"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"nodes",
|
||||
"topology",
|
||||
"processorType",
|
||||
"socketsPerNode",
|
||||
"coresPerSocket",
|
||||
"threadsPerCore",
|
||||
"flopRateScalar",
|
||||
"flopRateSimd",
|
||||
"memoryBandwidth"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://cluster.schema.json",
|
||||
"title": "HPC cluster description",
|
||||
"description": "Meta data information of a HPC cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"metricConfig",
|
||||
"subClusters"
|
||||
]
|
||||
"metricConfig": {
|
||||
"description": "Metric specifications",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Metric name",
|
||||
"type": "string"
|
||||
},
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"scope": {
|
||||
"description": "Native measurement resolution",
|
||||
"type": "string"
|
||||
},
|
||||
"timestep": {
|
||||
"description": "Frequency of timeseries points",
|
||||
"type": "integer"
|
||||
},
|
||||
"aggregation": {
|
||||
"description": "How the metric is aggregated",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"sum",
|
||||
"avg"
|
||||
]
|
||||
},
|
||||
"footprint": {
|
||||
"description": "Is it a footprint metric and what type",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"avg",
|
||||
"max",
|
||||
"min"
|
||||
]
|
||||
},
|
||||
"energy": {
|
||||
"description": "Is it used to calculate job energy",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"power",
|
||||
"energy"
|
||||
]
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"description": "Is lower better.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"peak": {
|
||||
"description": "Metric peak threshold (Upper metric limit)",
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"description": "Metric normal threshold",
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"description": "Metric alert threshold (Requires immediate action)",
|
||||
"type": "number"
|
||||
},
|
||||
"subClusters": {
|
||||
"description": "Array of cluster hardware partition metric thresholds",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Hardware partition name",
|
||||
"type": "string"
|
||||
},
|
||||
"footprint": {
|
||||
"description": "Is it a footprint metric and what type. Overwrite global setting",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"avg",
|
||||
"max",
|
||||
"min"
|
||||
]
|
||||
},
|
||||
"energy": {
|
||||
"description": "Is it used to calculate job energy. Overwrite global",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"power",
|
||||
"energy"
|
||||
]
|
||||
},
|
||||
"lowerIsBetter": {
|
||||
"description": "Is lower better. Overwrite global",
|
||||
"type": "boolean"
|
||||
},
|
||||
"peak": {
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"type": "number"
|
||||
},
|
||||
"remove": {
|
||||
"description": "Remove this metric for this subcluster",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"unit",
|
||||
"scope",
|
||||
"timestep",
|
||||
"aggregation",
|
||||
"peak",
|
||||
"normal",
|
||||
"caution",
|
||||
"alert"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
},
|
||||
"subClusters": {
|
||||
"description": "Array of cluster hardware partitions",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "Hardware partition name",
|
||||
"type": "string"
|
||||
},
|
||||
"processorType": {
|
||||
"description": "Processor type",
|
||||
"type": "string"
|
||||
},
|
||||
"socketsPerNode": {
|
||||
"description": "Number of sockets per node",
|
||||
"type": "integer"
|
||||
},
|
||||
"coresPerSocket": {
|
||||
"description": "Number of cores per socket",
|
||||
"type": "integer"
|
||||
},
|
||||
"threadsPerCore": {
|
||||
"description": "Number of SMT threads per core",
|
||||
"type": "integer"
|
||||
},
|
||||
"flopRateScalar": {
|
||||
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"description": "Theoretical node peak memory bandwidth in GB/s",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"value": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nodes": {
|
||||
"description": "Node list expression",
|
||||
"type": "string"
|
||||
},
|
||||
"topology": {
|
||||
"description": "Node topology",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"description": "HwThread lists of node",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"socket": {
|
||||
"description": "HwThread lists of sockets",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memoryDomain": {
|
||||
"description": "HwThread lists of memory domains",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"die": {
|
||||
"description": "HwThread lists of dies",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"core": {
|
||||
"description": "HwThread lists of cores",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"accelerators": {
|
||||
"type": "array",
|
||||
"description": "List of accelerator devices",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "The unique device id"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": "The accelerator type",
|
||||
"enum": [
|
||||
"Nvidia GPU",
|
||||
"AMD GPU",
|
||||
"Intel GPU"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "The accelerator model"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"type",
|
||||
"model"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node",
|
||||
"socket",
|
||||
"memoryDomain"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"nodes",
|
||||
"topology",
|
||||
"processorType",
|
||||
"socketsPerNode",
|
||||
"coresPerSocket",
|
||||
"threadsPerCore",
|
||||
"flopRateScalar",
|
||||
"flopRateSimd",
|
||||
"memoryBandwidth"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"metricConfig",
|
||||
"subClusters"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,433 +1,497 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://config.schema.json",
|
||||
"title": "cc-backend configuration file schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"addr": {
|
||||
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||
"type": "string"
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://config.schema.json",
|
||||
"title": "cc-backend configuration file schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"addr": {
|
||||
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||
"type": "string"
|
||||
},
|
||||
"apiAllowedIPs": {
|
||||
"description": "Addresses from which secured API endpoints can be reached",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"user": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"group": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-authentication": {
|
||||
"description": "Disable authentication (for everything: API, Web-UI, ...).",
|
||||
"type": "boolean"
|
||||
},
|
||||
"embed-static-files": {
|
||||
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"static-files": {
|
||||
"description": "Folder where static assets can be found, if embed-static-files is false.",
|
||||
"type": "string"
|
||||
},
|
||||
"db-driver": {
|
||||
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"sqlite3",
|
||||
"mysql"
|
||||
]
|
||||
},
|
||||
"db": {
|
||||
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
|
||||
"type": "string"
|
||||
},
|
||||
"archive": {
|
||||
"description": "Configuration keys for job-archive",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"description": "Backend type for job-archive",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"file",
|
||||
"s3"
|
||||
]
|
||||
},
|
||||
"user": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
"path": {
|
||||
"description": "Path to job archive for file backend",
|
||||
"type": "string"
|
||||
},
|
||||
"group": {
|
||||
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
|
||||
"type": "string"
|
||||
"compression": {
|
||||
"description": "Setup automatic compression for jobs older than number of days",
|
||||
"type": "integer"
|
||||
},
|
||||
"disable-authentication": {
|
||||
"description": "Disable authentication (for everything: API, Web-UI, ...).",
|
||||
"type": "boolean"
|
||||
},
|
||||
"embed-static-files": {
|
||||
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"static-files": {
|
||||
"description": "Folder where static assets can be found, if embed-static-files is false.",
|
||||
"type": "string"
|
||||
},
|
||||
"db-driver": {
|
||||
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"sqlite3",
|
||||
"mysql"
|
||||
]
|
||||
},
|
||||
"db": {
|
||||
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
|
||||
"type": "string"
|
||||
},
|
||||
"job-archive": {
|
||||
"description": "Configuration keys for job-archive",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"description": "Backend type for job-archive",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"file",
|
||||
"s3"
|
||||
]
|
||||
},
|
||||
"path": {
|
||||
"description": "Path to job archive for file backend",
|
||||
"type": "string"
|
||||
},
|
||||
"compression": {
|
||||
"description": "Setup automatic compression for jobs older than number of days",
|
||||
"type": "integer"
|
||||
},
|
||||
"retention": {
|
||||
"description": "Configuration keys for retention",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"policy": {
|
||||
"description": "Retention policy",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"none",
|
||||
"delete",
|
||||
"move"
|
||||
]
|
||||
},
|
||||
"includeDB": {
|
||||
"description": "Also remove jobs from database",
|
||||
"type": "boolean"
|
||||
},
|
||||
"age": {
|
||||
"description": "Act on jobs with startTime older than age (in days)",
|
||||
"type": "integer"
|
||||
},
|
||||
"location": {
|
||||
"description": "The target directory for retention. Only applicable for retention move.",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"policy"
|
||||
]
|
||||
}
|
||||
"retention": {
|
||||
"description": "Configuration keys for retention",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"policy": {
|
||||
"description": "Retention policy",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"none",
|
||||
"delete",
|
||||
"move"
|
||||
]
|
||||
},
|
||||
"required": [
|
||||
"kind"
|
||||
]
|
||||
"includeDB": {
|
||||
"description": "Also remove jobs from database",
|
||||
"type": "boolean"
|
||||
},
|
||||
"age": {
|
||||
"description": "Act on jobs with startTime older than age (in days)",
|
||||
"type": "integer"
|
||||
},
|
||||
"location": {
|
||||
"description": "The target directory for retention. Only applicable for retention move.",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"policy"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"kind"
|
||||
]
|
||||
},
|
||||
"disable-archive": {
|
||||
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"validate": {
|
||||
"description": "Validate all input json documents against json schema.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"session-max-age": {
|
||||
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
|
||||
"type": "string"
|
||||
},
|
||||
"https-cert-file": {
|
||||
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"https-key-file": {
|
||||
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"redirect-http-to": {
|
||||
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
|
||||
"type": "string"
|
||||
},
|
||||
"stop-jobs-exceeding-walltime": {
|
||||
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
|
||||
"type": "integer"
|
||||
},
|
||||
"short-running-jobs-duration": {
|
||||
"description": "Do not show running jobs shorter than X seconds.",
|
||||
"type": "integer"
|
||||
},
|
||||
"emission-constant": {
|
||||
"description": ".",
|
||||
"type": "integer"
|
||||
},
|
||||
"cron-frequency": {
|
||||
"description": "Frequency of cron job workers.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"duration-worker": {
|
||||
"description": "Duration Update Worker [Defaults to '5m']",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-archive": {
|
||||
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
|
||||
"type": "boolean"
|
||||
"footprint-worker": {
|
||||
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"enable-resampling": {
|
||||
"description": "Enable dynamic zoom in frontend metric plots.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"trigger": {
|
||||
"description": "Trigger next zoom level at less than this many visible datapoints.",
|
||||
"type": "integer"
|
||||
},
|
||||
"validate": {
|
||||
"description": "Validate all input json documents against json schema.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"session-max-age": {
|
||||
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
|
||||
"type": "string"
|
||||
},
|
||||
"https-cert-file": {
|
||||
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"https-key-file": {
|
||||
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
|
||||
"type": "string"
|
||||
},
|
||||
"redirect-http-to": {
|
||||
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
|
||||
"type": "string"
|
||||
},
|
||||
"stop-jobs-exceeding-walltime": {
|
||||
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
|
||||
"resolutions": {
|
||||
"description": "Array of resampling target resolutions, in seconds.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"trigger",
|
||||
"resolutions"
|
||||
]
|
||||
},
|
||||
"jwts": {
|
||||
"description": "For JWT token authentication.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"max-age": {
|
||||
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||
"type": "string"
|
||||
},
|
||||
"short-running-jobs-duration": {
|
||||
"description": "Do not show running jobs shorter than X seconds.",
|
||||
"type": "integer"
|
||||
"cookieName": {
|
||||
"description": "Cookie that should be checked for a JWT token.",
|
||||
"type": "string"
|
||||
},
|
||||
"jwts": {
|
||||
"description": "For JWT token authentication.",
|
||||
"validateUser": {
|
||||
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"trustedIssuer": {
|
||||
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"max-age"
|
||||
]
|
||||
},
|
||||
"oidc": {
|
||||
"provider": {
|
||||
"description": "",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"updateUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"required": [
|
||||
"provider"
|
||||
]
|
||||
},
|
||||
"ldap": {
|
||||
"description": "For LDAP Authentication and user synchronisation.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"description": "URL of LDAP directory server.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_base": {
|
||||
"description": "Base DN of user tree root.",
|
||||
"type": "string"
|
||||
},
|
||||
"search_dn": {
|
||||
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_bind": {
|
||||
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_filter": {
|
||||
"description": "Filter to extract users for syncing.",
|
||||
"type": "string"
|
||||
},
|
||||
"username_attr": {
|
||||
"description": "Attribute with full username. Default: gecos",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_interval": {
|
||||
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_del_old_users": {
|
||||
"description": "Delete obsolete users in database.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"url",
|
||||
"user_base",
|
||||
"search_dn",
|
||||
"user_bind",
|
||||
"user_filter"
|
||||
]
|
||||
},
|
||||
"clusters": {
|
||||
"description": "Configuration for the clusters to be displayed.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The name of the cluster.",
|
||||
"type": "string"
|
||||
},
|
||||
"metricDataRepository": {
|
||||
"description": "Type of the metric data repository for this cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"max-age": {
|
||||
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||
"type": "string"
|
||||
},
|
||||
"cookieName": {
|
||||
"description": "Cookie that should be checked for a JWT token.",
|
||||
"type": "string"
|
||||
},
|
||||
"validateUser": {
|
||||
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"trustedIssuer": {
|
||||
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
}
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"influxdb",
|
||||
"prometheus",
|
||||
"cc-metric-store",
|
||||
"test"
|
||||
]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"max-age"
|
||||
"kind",
|
||||
"url"
|
||||
]
|
||||
},
|
||||
"ldap": {
|
||||
"description": "For LDAP Authentication and user synchronisation.",
|
||||
},
|
||||
"filterRanges": {
|
||||
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"description": "URL of LDAP directory server.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_base": {
|
||||
"description": "Base DN of user tree root.",
|
||||
"type": "string"
|
||||
},
|
||||
"search_dn": {
|
||||
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_bind": {
|
||||
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_filter": {
|
||||
"description": "Filter to extract users for syncing.",
|
||||
"type": "string"
|
||||
},
|
||||
"username_attr": {
|
||||
"description": "Attribute with full username. Default: gecos",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_interval": {
|
||||
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_del_old_users": {
|
||||
"description": "Delete obsolete users in database.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"url",
|
||||
"user_base",
|
||||
"search_dn",
|
||||
"user_bind",
|
||||
"user_filter"
|
||||
]
|
||||
},
|
||||
"clusters": {
|
||||
"description": "Configuration for the clusters to be displayed.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"numNodes": {
|
||||
"description": "UI slider range for number of nodes",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The name of the cluster.",
|
||||
"type": "string"
|
||||
},
|
||||
"metricDataRepository": {
|
||||
"description": "Type of the metric data repository for this cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"influxdb",
|
||||
"prometheus",
|
||||
"cc-metric-store",
|
||||
"test"
|
||||
]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"kind",
|
||||
"url"
|
||||
]
|
||||
},
|
||||
"filterRanges": {
|
||||
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"numNodes": {
|
||||
"description": "UI slider range for number of nodes",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
},
|
||||
"duration": {
|
||||
"description": "UI slider range for duration",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "UI slider range for start time",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"to": {
|
||||
"type": "null"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"numNodes",
|
||||
"duration",
|
||||
"startTime"
|
||||
]
|
||||
}
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"metricDataRepository",
|
||||
"filterRanges"
|
||||
],
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"ui-defaults": {
|
||||
"description": "Default configuration for web UI",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"plot_general_colorBackground": {
|
||||
"description": "Color plot background according to job average threshold limits",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_general_lineWidth": {
|
||||
"description": "Initial linewidth",
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
},
|
||||
"duration": {
|
||||
"description": "UI slider range for duration",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"plot_list_jobsPerPage": {
|
||||
"description": "Jobs shown per page in job lists",
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"plot_view_plotsPerRow": {
|
||||
"description": "Number of plots per row in single job view",
|
||||
"type": "integer"
|
||||
"required": [
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "UI slider range for start time",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"to": {
|
||||
"type": "null"
|
||||
}
|
||||
},
|
||||
"plot_view_showPolarplot": {
|
||||
"description": "Option to toggle polar plot in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_view_showRoofline": {
|
||||
"description": "Option to toggle roofline plot in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_view_showStatTable": {
|
||||
"description": "Option to toggle the node statistic table in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"system_view_selectedMetric": {
|
||||
"description": "Initial metric shown in system view",
|
||||
"type": "string"
|
||||
},
|
||||
"analysis_view_histogramMetrics": {
|
||||
"description": "Metrics to show as job count histograms in analysis view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"analysis_view_scatterPlotMetrics": {
|
||||
"description": "Initial scatter plto configuration in analysis view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 2,
|
||||
"maxItems": 2
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"job_view_nodestats_selectedMetrics": {
|
||||
"description": "Initial metrics shown in node statistics table of single job view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"job_view_polarPlotMetrics": {
|
||||
"description": "Metrics shown in polar plot of single job view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"job_view_selectedMetrics": {
|
||||
"description": "",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"plot_general_colorscheme": {
|
||||
"description": "Initial color scheme",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"plot_list_selectedMetrics": {
|
||||
"description": "Initial metric plots shown in jobs lists",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
}
|
||||
"required": [
|
||||
"from",
|
||||
"to"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"plot_general_colorBackground",
|
||||
"plot_general_lineWidth",
|
||||
"plot_list_jobsPerPage",
|
||||
"plot_view_plotsPerRow",
|
||||
"plot_view_showPolarplot",
|
||||
"plot_view_showRoofline",
|
||||
"plot_view_showStatTable",
|
||||
"system_view_selectedMetric",
|
||||
"analysis_view_histogramMetrics",
|
||||
"analysis_view_scatterPlotMetrics",
|
||||
"job_view_nodestats_selectedMetrics",
|
||||
"job_view_polarPlotMetrics",
|
||||
"job_view_selectedMetrics",
|
||||
"plot_general_colorscheme",
|
||||
"plot_list_selectedMetrics"
|
||||
"numNodes",
|
||||
"duration",
|
||||
"startTime"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"metricDataRepository",
|
||||
"filterRanges"
|
||||
],
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"jwts",
|
||||
"clusters"
|
||||
]
|
||||
"ui-defaults": {
|
||||
"description": "Default configuration for web UI",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"plot_general_colorBackground": {
|
||||
"description": "Color plot background according to job average threshold limits",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_general_lineWidth": {
|
||||
"description": "Initial linewidth",
|
||||
"type": "integer"
|
||||
},
|
||||
"plot_list_jobsPerPage": {
|
||||
"description": "Jobs shown per page in job lists",
|
||||
"type": "integer"
|
||||
},
|
||||
"plot_view_plotsPerRow": {
|
||||
"description": "Number of plots per row in single job view",
|
||||
"type": "integer"
|
||||
},
|
||||
"plot_view_showPolarplot": {
|
||||
"description": "Option to toggle polar plot in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_view_showRoofline": {
|
||||
"description": "Option to toggle roofline plot in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"plot_view_showStatTable": {
|
||||
"description": "Option to toggle the node statistic table in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"system_view_selectedMetric": {
|
||||
"description": "Initial metric shown in system view",
|
||||
"type": "string"
|
||||
},
|
||||
"job_view_showFootprint": {
|
||||
"description": "Option to toggle footprint ui in single job view",
|
||||
"type": "boolean"
|
||||
},
|
||||
"job_list_usePaging": {
|
||||
"description": "Option to switch from continous scroll to paging",
|
||||
"type": "boolean"
|
||||
},
|
||||
"analysis_view_histogramMetrics": {
|
||||
"description": "Metrics to show as job count histograms in analysis view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"analysis_view_scatterPlotMetrics": {
|
||||
"description": "Initial scatter plto configuration in analysis view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 2,
|
||||
"maxItems": 2
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"job_view_nodestats_selectedMetrics": {
|
||||
"description": "Initial metrics shown in node statistics table of single job view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"job_view_selectedMetrics": {
|
||||
"description": "Initial metrics shown as plots in single job view",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"plot_general_colorscheme": {
|
||||
"description": "Initial color scheme",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"plot_list_selectedMetrics": {
|
||||
"description": "Initial metric plots shown in jobs lists",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minItems": 1
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"plot_general_colorBackground",
|
||||
"plot_general_lineWidth",
|
||||
"plot_list_jobsPerPage",
|
||||
"plot_view_plotsPerRow",
|
||||
"plot_view_showPolarplot",
|
||||
"plot_view_showRoofline",
|
||||
"plot_view_showStatTable",
|
||||
"system_view_selectedMetric",
|
||||
"job_view_showFootprint",
|
||||
"job_list_usePaging",
|
||||
"analysis_view_histogramMetrics",
|
||||
"analysis_view_scatterPlotMetrics",
|
||||
"job_view_nodestats_selectedMetrics",
|
||||
"job_view_selectedMetrics",
|
||||
"plot_general_colorscheme",
|
||||
"plot_list_selectedMetrics"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"jwts",
|
||||
"clusters"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,490 +1,490 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-data.schema.json",
|
||||
"title": "Job metric data list",
|
||||
"description": "Collection of metric data of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"mem_used": {
|
||||
"description": "Memory capacity used",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"flops_any": {
|
||||
"description": "Total flop rate with DP flops scaled up",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"mem_bw": {
|
||||
"description": "Main memory bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"net_bw": {
|
||||
"description": "Total fast interconnect network bandwidth",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ipc": {
|
||||
"description": "Instructions executed per cycle",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_user": {
|
||||
"description": "CPU user active core utilization",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_load": {
|
||||
"description": "CPU requested core utilization (load 1m)",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"flops_dp": {
|
||||
"description": "Double precision flop rate",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"flops_sp": {
|
||||
"description": "Single precision flops rate",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"vectorization_ratio": {
|
||||
"description": "Fraction of arithmetic instructions using SIMD instructions",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_power": {
|
||||
"description": "CPU power consumption",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"mem_power": {
|
||||
"description": "Memory power consumption",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"acc_utilization": {
|
||||
"description": "GPU utilization",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"description": "GPU memory capacity used",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"acc_power": {
|
||||
"description": "GPU power consumption",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"clock": {
|
||||
"description": "Average core frequency",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"eth_read_bw": {
|
||||
"description": "Ethernet read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"eth_write_bw": {
|
||||
"description": "Ethernet write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"filesystems": {
|
||||
"description": "Array of filesystems",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"nfs",
|
||||
"lustre",
|
||||
"gpfs",
|
||||
"nvme",
|
||||
"ssd",
|
||||
"hdd",
|
||||
"beegfs"
|
||||
]
|
||||
},
|
||||
"read_bw": {
|
||||
"description": "File system read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"write_bw": {
|
||||
"description": "File system write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"read_req": {
|
||||
"description": "File system read requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"write_req": {
|
||||
"description": "File system write requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"inodes": {
|
||||
"description": "File system write requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"accesses": {
|
||||
"description": "File system open and close",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"fsync": {
|
||||
"description": "File system fsync",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"create": {
|
||||
"description": "File system create",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"open": {
|
||||
"description": "File system open",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"close": {
|
||||
"description": "File system close",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"seek": {
|
||||
"description": "File system seek",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"type",
|
||||
"read_bw",
|
||||
"write_bw"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-data.schema.json",
|
||||
"title": "Job metric data list",
|
||||
"description": "Collection of metric data of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"mem_used": {
|
||||
"description": "Memory capacity used",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_rcv_packets": {
|
||||
"description": "Network interconnect read packets",
|
||||
"flops_any": {
|
||||
"description": "Total flop rate with DP flops scaled up",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"mem_bw": {
|
||||
"description": "Main memory bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"net_bw": {
|
||||
"description": "Total fast interconnect network bandwidth",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ipc": {
|
||||
"description": "Instructions executed per cycle",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_user": {
|
||||
"description": "CPU user active core utilization",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_load": {
|
||||
"description": "CPU requested core utilization (load 1m)",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"flops_dp": {
|
||||
"description": "Double precision flop rate",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"flops_sp": {
|
||||
"description": "Single precision flops rate",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"vectorization_ratio": {
|
||||
"description": "Fraction of arithmetic instructions using SIMD instructions",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"cpu_power": {
|
||||
"description": "CPU power consumption",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"mem_power": {
|
||||
"description": "Memory power consumption",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"acc_utilization": {
|
||||
"description": "GPU utilization",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"description": "GPU memory capacity used",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"acc_power": {
|
||||
"description": "GPU power consumption",
|
||||
"properties": {
|
||||
"accelerator": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"accelerator"
|
||||
]
|
||||
},
|
||||
"clock": {
|
||||
"description": "Average core frequency",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"socket": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"memoryDomain": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"core": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
},
|
||||
"hwthread": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"minProperties": 1
|
||||
},
|
||||
"eth_read_bw": {
|
||||
"description": "Ethernet read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"eth_write_bw": {
|
||||
"description": "Ethernet write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"filesystems": {
|
||||
"description": "Array of filesystems",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"nfs",
|
||||
"lustre",
|
||||
"gpfs",
|
||||
"nvme",
|
||||
"ssd",
|
||||
"hdd",
|
||||
"beegfs"
|
||||
]
|
||||
},
|
||||
"read_bw": {
|
||||
"description": "File system read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"write_bw": {
|
||||
"description": "File system write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"read_req": {
|
||||
"description": "File system read requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"write_req": {
|
||||
"description": "File system write requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"inodes": {
|
||||
"description": "File system write requests",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"accesses": {
|
||||
"description": "File system open and close",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"fsync": {
|
||||
"description": "File system fsync",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"create": {
|
||||
"description": "File system create",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"open": {
|
||||
"description": "File system open",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"close": {
|
||||
"description": "File system close",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"seek": {
|
||||
"description": "File system seek",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_send_packets": {
|
||||
"description": "Network interconnect send packet",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_read_bw": {
|
||||
"description": "Network interconnect read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_write_bw": {
|
||||
"description": "Network interconnect write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
"name",
|
||||
"type",
|
||||
"read_bw",
|
||||
"write_bw"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"ic_rcv_packets": {
|
||||
"description": "Network interconnect read packets",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"cpu_user",
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"net_bw",
|
||||
"filesystems"
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_send_packets": {
|
||||
"description": "Network interconnect send packet",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_read_bw": {
|
||||
"description": "Network interconnect read bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"ic_write_bw": {
|
||||
"description": "Network interconnect write bandwidth",
|
||||
"properties": {
|
||||
"node": {
|
||||
"$ref": "embedfs://job-metric-data.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"required": [
|
||||
"cpu_user",
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"net_bw",
|
||||
"filesystems"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,351 +1,351 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-meta.schema.json",
|
||||
"title": "Job meta data",
|
||||
"description": "Meta data information of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer"
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-meta.schema.json",
|
||||
"title": "Job meta data",
|
||||
"description": "Meta data information of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"description": "The unique identifier of a job",
|
||||
"type": "integer"
|
||||
},
|
||||
"user": {
|
||||
"description": "The unique identifier of a user",
|
||||
"type": "string"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string"
|
||||
},
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer"
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "Number of HWThreads used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 2
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run",
|
||||
"type": "integer"
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
"cancelled",
|
||||
"stopped",
|
||||
"out_of_memory",
|
||||
"timeout"
|
||||
]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start epoch time stamp in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hostname": {
|
||||
"type": "string"
|
||||
},
|
||||
"project": {
|
||||
"description": "The unique identifier of a project",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster": {
|
||||
"description": "The unique identifier of a cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"subCluster": {
|
||||
"description": "The unique identifier of a sub cluster",
|
||||
"type": "string"
|
||||
},
|
||||
"partition": {
|
||||
"description": "The Slurm partition to which the job was submitted",
|
||||
"type": "string"
|
||||
},
|
||||
"arrayJobId": {
|
||||
"description": "The unique identifier of an array job",
|
||||
"type": "integer"
|
||||
},
|
||||
"numNodes": {
|
||||
"description": "Number of nodes used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"numHwthreads": {
|
||||
"description": "Number of HWThreads used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"numAcc": {
|
||||
"description": "Number of accelerators used",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"exclusive": {
|
||||
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 2
|
||||
},
|
||||
"monitoringStatus": {
|
||||
"description": "State of monitoring system during job run",
|
||||
"type": "integer"
|
||||
},
|
||||
"smt": {
|
||||
"description": "SMT threads used by job",
|
||||
"type": "integer"
|
||||
},
|
||||
"walltime": {
|
||||
"description": "Requested walltime of job in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"jobState": {
|
||||
"description": "Final state of job",
|
||||
},
|
||||
"hwthreads": {
|
||||
"type": "array",
|
||||
"description": "List of OS processor ids",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"accelerators": {
|
||||
"type": "array",
|
||||
"description": "List of of accelerator device ids",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"configuration": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"completed",
|
||||
"failed",
|
||||
"cancelled",
|
||||
"stopped",
|
||||
"out_of_memory",
|
||||
"timeout"
|
||||
]
|
||||
"description": "The configuration options of the node"
|
||||
}
|
||||
},
|
||||
"startTime": {
|
||||
"description": "Start epoch time stamp in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
"required": [
|
||||
"hostname"
|
||||
],
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobScript": {
|
||||
"type": "string",
|
||||
"description": "The batch script of the job"
|
||||
},
|
||||
"duration": {
|
||||
"description": "Duration of job in seconds",
|
||||
"type": "integer",
|
||||
"exclusiveMinimum": 0
|
||||
"jobName": {
|
||||
"type": "string",
|
||||
"description": "Slurm Job name"
|
||||
},
|
||||
"resources": {
|
||||
"description": "Resources used by job",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hostname": {
|
||||
"type": "string"
|
||||
},
|
||||
"hwthreads": {
|
||||
"type": "array",
|
||||
"description": "List of OS processor ids",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"accelerators": {
|
||||
"type": "array",
|
||||
"description": "List of of accelerator device ids",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"configuration": {
|
||||
"type": "string",
|
||||
"description": "The configuration options of the node"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"hostname"
|
||||
],
|
||||
"minItems": 1
|
||||
}
|
||||
"slurmInfo": {
|
||||
"type": "string",
|
||||
"description": "Additional slurm infos as show by scontrol show job"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"metaData": {
|
||||
"description": "Additional information about the job",
|
||||
"required": [
|
||||
"name",
|
||||
"type"
|
||||
]
|
||||
},
|
||||
"uniqueItems": true
|
||||
},
|
||||
"statistics": {
|
||||
"description": "Job statistic data",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"mem_used": {
|
||||
"description": "Memory capacity used (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"cpu_load": {
|
||||
"description": "CPU requested core utilization (load 1m) (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_any": {
|
||||
"description": "Total flop rate with DP flops scaled up (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"mem_bw": {
|
||||
"description": "Main memory bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"net_bw": {
|
||||
"description": "Total fast interconnect network bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"file_bw": {
|
||||
"description": "Total file IO bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ipc": {
|
||||
"description": "Instructions executed per cycle",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"cpu_user": {
|
||||
"description": "CPU user active core utilization",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_dp": {
|
||||
"description": "Double precision flop rate",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_sp": {
|
||||
"description": "Single precision flops rate",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"rapl_power": {
|
||||
"description": "CPU power consumption",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_used": {
|
||||
"description": "GPU utilization",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"description": "GPU memory capacity used",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_power": {
|
||||
"description": "GPU power consumption",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"clock": {
|
||||
"description": "Average core frequency",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"eth_read_bw": {
|
||||
"description": "Ethernet read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"eth_write_bw": {
|
||||
"description": "Ethernet write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_rcv_packets": {
|
||||
"description": "Network interconnect read packets",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_send_packets": {
|
||||
"description": "Network interconnect send packet",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_read_bw": {
|
||||
"description": "Network interconnect read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_write_bw": {
|
||||
"description": "Network interconnect write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"filesystems": {
|
||||
"description": "Array of filesystems",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobScript": {
|
||||
"type": "string",
|
||||
"description": "The batch script of the job"
|
||||
},
|
||||
"jobName": {
|
||||
"type": "string",
|
||||
"description": "Slurm Job name"
|
||||
},
|
||||
"slurmInfo": {
|
||||
"type": "string",
|
||||
"description": "Additional slurm infos as show by scontrol show job"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": {
|
||||
"description": "List of tags",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"type"
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"nfs",
|
||||
"lustre",
|
||||
"gpfs",
|
||||
"nvme",
|
||||
"ssd",
|
||||
"hdd",
|
||||
"beegfs"
|
||||
]
|
||||
},
|
||||
"uniqueItems": true
|
||||
},
|
||||
"statistics": {
|
||||
"description": "Job statistic data",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"mem_used": {
|
||||
"description": "Memory capacity used (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"cpu_load": {
|
||||
"description": "CPU requested core utilization (load 1m) (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_any": {
|
||||
"description": "Total flop rate with DP flops scaled up (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"mem_bw": {
|
||||
"description": "Main memory bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"net_bw": {
|
||||
"description": "Total fast interconnect network bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"file_bw": {
|
||||
"description": "Total file IO bandwidth (required)",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ipc": {
|
||||
"description": "Instructions executed per cycle",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"cpu_user": {
|
||||
"description": "CPU user active core utilization",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_dp": {
|
||||
"description": "Double precision flop rate",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"flops_sp": {
|
||||
"description": "Single precision flops rate",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"rapl_power": {
|
||||
"description": "CPU power consumption",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_used": {
|
||||
"description": "GPU utilization",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"description": "GPU memory capacity used",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"acc_power": {
|
||||
"description": "GPU power consumption",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"clock": {
|
||||
"description": "Average core frequency",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"eth_read_bw": {
|
||||
"description": "Ethernet read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"eth_write_bw": {
|
||||
"description": "Ethernet write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_rcv_packets": {
|
||||
"description": "Network interconnect read packets",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_send_packets": {
|
||||
"description": "Network interconnect send packet",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_read_bw": {
|
||||
"description": "Network interconnect read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"ic_write_bw": {
|
||||
"description": "Network interconnect write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"filesystems": {
|
||||
"description": "Array of filesystems",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"nfs",
|
||||
"lustre",
|
||||
"gpfs",
|
||||
"nvme",
|
||||
"ssd",
|
||||
"hdd",
|
||||
"beegfs"
|
||||
]
|
||||
},
|
||||
"read_bw": {
|
||||
"description": "File system read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"write_bw": {
|
||||
"description": "File system write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"read_req": {
|
||||
"description": "File system read requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"write_req": {
|
||||
"description": "File system write requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"inodes": {
|
||||
"description": "File system write requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"accesses": {
|
||||
"description": "File system open and close",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"fsync": {
|
||||
"description": "File system fsync",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"create": {
|
||||
"description": "File system create",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"open": {
|
||||
"description": "File system open",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"close": {
|
||||
"description": "File system close",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"seek": {
|
||||
"description": "File system seek",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"type",
|
||||
"read_bw",
|
||||
"write_bw"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"read_bw": {
|
||||
"description": "File system read bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"write_bw": {
|
||||
"description": "File system write bandwidth",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"read_req": {
|
||||
"description": "File system read requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"write_req": {
|
||||
"description": "File system write requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"inodes": {
|
||||
"description": "File system write requests",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"accesses": {
|
||||
"description": "File system open and close",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"fsync": {
|
||||
"description": "File system fsync",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"create": {
|
||||
"description": "File system create",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"open": {
|
||||
"description": "File system open",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"close": {
|
||||
"description": "File system close",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
},
|
||||
"seek": {
|
||||
"description": "File system seek",
|
||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"cpu_user",
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
"name",
|
||||
"type",
|
||||
"read_bw",
|
||||
"write_bw"
|
||||
]
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"jobId",
|
||||
"user",
|
||||
"project",
|
||||
"cluster",
|
||||
"subCluster",
|
||||
"numNodes",
|
||||
"exclusive",
|
||||
"startTime",
|
||||
"jobState",
|
||||
"duration",
|
||||
"resources",
|
||||
"statistics"
|
||||
]
|
||||
},
|
||||
"required": [
|
||||
"cpu_user",
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"jobId",
|
||||
"user",
|
||||
"project",
|
||||
"cluster",
|
||||
"subCluster",
|
||||
"numNodes",
|
||||
"exclusive",
|
||||
"startTime",
|
||||
"jobState",
|
||||
"duration",
|
||||
"resources",
|
||||
"statistics"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,216 +1,216 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-metric-data.schema.json",
|
||||
"title": "Job metric data",
|
||||
"description": "Metric data of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"timestep": {
|
||||
"description": "Measurement interval in seconds",
|
||||
"type": "integer"
|
||||
},
|
||||
"thresholds": {
|
||||
"description": "Metric thresholds for specific system",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"peak": {
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"statisticsSeries": {
|
||||
"type": "object",
|
||||
"description": "Statistics series across topology",
|
||||
"properties": {
|
||||
"min": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"max": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"mean": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"percentiles": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"10": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"20": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"30": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"40": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"50": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"60": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"70": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"80": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"90": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"25": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"75": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"series": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hostname": {
|
||||
"type": "string"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"statistics": {
|
||||
"type": "object",
|
||||
"description": "Statistics across time dimension",
|
||||
"properties": {
|
||||
"avg": {
|
||||
"description": "Series average",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"min": {
|
||||
"description": "Series minimum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"max": {
|
||||
"description": "Series maximum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"avg",
|
||||
"min",
|
||||
"max"
|
||||
]
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"contains": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"hostname",
|
||||
"statistics",
|
||||
"data"
|
||||
]
|
||||
}
|
||||
}
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-metric-data.schema.json",
|
||||
"title": "Job metric data",
|
||||
"description": "Metric data of a HPC job",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"required": [
|
||||
"unit",
|
||||
"timestep",
|
||||
"series"
|
||||
]
|
||||
"timestep": {
|
||||
"description": "Measurement interval in seconds",
|
||||
"type": "integer"
|
||||
},
|
||||
"thresholds": {
|
||||
"description": "Metric thresholds for specific system",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"peak": {
|
||||
"type": "number"
|
||||
},
|
||||
"normal": {
|
||||
"type": "number"
|
||||
},
|
||||
"caution": {
|
||||
"type": "number"
|
||||
},
|
||||
"alert": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"statisticsSeries": {
|
||||
"type": "object",
|
||||
"description": "Statistics series across topology",
|
||||
"properties": {
|
||||
"min": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"max": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"mean": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"percentiles": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"10": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"20": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"30": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"40": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"50": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"60": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"70": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"80": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"90": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"25": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"75": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"series": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hostname": {
|
||||
"type": "string"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"statistics": {
|
||||
"type": "object",
|
||||
"description": "Statistics across time dimension",
|
||||
"properties": {
|
||||
"avg": {
|
||||
"description": "Series average",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"min": {
|
||||
"description": "Series minimum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"max": {
|
||||
"description": "Series maximum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"avg",
|
||||
"min",
|
||||
"max"
|
||||
]
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"contains": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"minItems": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"hostname",
|
||||
"statistics",
|
||||
"data"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"unit",
|
||||
"timestep",
|
||||
"series"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,34 +1,34 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-metric-statistics.schema.json",
|
||||
"title": "Job statistics",
|
||||
"description": "Format specification for job metric statistics",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"avg": {
|
||||
"description": "Job metric average",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"min": {
|
||||
"description": "Job metric minimum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"max": {
|
||||
"description": "Job metric maximum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://job-metric-statistics.schema.json",
|
||||
"title": "Job statistics",
|
||||
"description": "Format specification for job metric statistics",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"description": "Metric unit",
|
||||
"$ref": "embedfs://unit.schema.json"
|
||||
},
|
||||
"required": [
|
||||
"unit",
|
||||
"avg",
|
||||
"min",
|
||||
"max"
|
||||
]
|
||||
"avg": {
|
||||
"description": "Job metric average",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"min": {
|
||||
"description": "Job metric minimum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"max": {
|
||||
"description": "Job metric maximum",
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"unit",
|
||||
"avg",
|
||||
"min",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,40 +1,40 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://unit.schema.json",
|
||||
"title": "Metric unit",
|
||||
"description": "Format specification for job metric units",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"base": {
|
||||
"description": "Metric base unit",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"B",
|
||||
"F",
|
||||
"B/s",
|
||||
"F/s",
|
||||
"CPI",
|
||||
"IPC",
|
||||
"Hz",
|
||||
"W",
|
||||
"°C",
|
||||
""
|
||||
]
|
||||
},
|
||||
"prefix": {
|
||||
"description": "Unit prefix",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"K",
|
||||
"M",
|
||||
"G",
|
||||
"T",
|
||||
"P",
|
||||
"E"
|
||||
]
|
||||
}
|
||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "embedfs://unit.schema.json",
|
||||
"title": "Metric unit",
|
||||
"description": "Format specification for job metric units",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"base": {
|
||||
"description": "Metric base unit",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"B",
|
||||
"F",
|
||||
"B/s",
|
||||
"F/s",
|
||||
"CPI",
|
||||
"IPC",
|
||||
"Hz",
|
||||
"W",
|
||||
"°C",
|
||||
""
|
||||
]
|
||||
},
|
||||
"required": [
|
||||
"base"
|
||||
]
|
||||
"prefix": {
|
||||
"description": "Unit prefix",
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"K",
|
||||
"M",
|
||||
"G",
|
||||
"T",
|
||||
"P",
|
||||
"E"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"base"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -42,11 +42,11 @@ type User struct {
|
||||
Username string `json:"username"`
|
||||
Password string `json:"-"`
|
||||
Name string `json:"name"`
|
||||
Email string `json:"email"`
|
||||
Roles []string `json:"roles"`
|
||||
Projects []string `json:"projects"`
|
||||
AuthType AuthType `json:"authType"`
|
||||
AuthSource AuthSource `json:"authSource"`
|
||||
Email string `json:"email"`
|
||||
Projects []string `json:"projects"`
|
||||
}
|
||||
|
||||
func (u *User) HasProject(project string) bool {
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// type Accelerator struct {
|
||||
// ID string `json:"id"`
|
||||
// Type string `json:"type"`
|
||||
// Model string `json:"model"`
|
||||
// }
|
||||
|
||||
// type Topology struct {
|
||||
// Node []int `json:"node"`
|
||||
// Socket [][]int `json:"socket"`
|
||||
// MemoryDomain [][]int `json:"memoryDomain"`
|
||||
// Die [][]int `json:"die"`
|
||||
// Core [][]int `json:"core"`
|
||||
// Accelerators []*Accelerator `json:"accelerators"`
|
||||
// }
|
||||
|
||||
type SubCluster struct {
|
||||
Name string `json:"name"`
|
||||
Nodes string `json:"nodes"`
|
||||
NumberOfNodes int `json:"numberOfNodes"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar int `json:"flopRateScalar"`
|
||||
FlopRateSimd int `json:"flopRateSimd"`
|
||||
MemoryBandwidth int `json:"memoryBandwidth"`
|
||||
Topology *schema.Topology `json:"topology"`
|
||||
}
|
||||
|
||||
// type SubClusterConfig struct {
|
||||
// Name string `json:"name"`
|
||||
// Peak float64 `json:"peak"`
|
||||
// Normal float64 `json:"normal"`
|
||||
// Caution float64 `json:"caution"`
|
||||
// Alert float64 `json:"alert"`
|
||||
// }
|
||||
|
||||
type MetricConfig struct {
|
||||
Name string `json:"name"`
|
||||
Unit string `json:"unit"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
Aggregation string `json:"aggregation"`
|
||||
Timestep int `json:"timestep"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
SubClusters []*schema.SubClusterConfig `json:"subClusters"`
|
||||
}
|
||||
|
||||
type Cluster struct {
|
||||
Name string `json:"name"`
|
||||
MetricConfig []*MetricConfig `json:"metricConfig"`
|
||||
SubClusters []*SubCluster `json:"subClusters"`
|
||||
}
|
||||
@@ -1,166 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var Clusters []*Cluster
|
||||
var nodeLists map[string]map[string]archive.NodeList
|
||||
|
||||
func initClusterConfig() error {
|
||||
|
||||
Clusters = []*Cluster{}
|
||||
nodeLists = map[string]map[string]archive.NodeList{}
|
||||
|
||||
for _, c := range ar.GetClusters() {
|
||||
|
||||
cluster, err := ar.LoadClusterCfg(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(cluster.Name) == 0 ||
|
||||
len(cluster.MetricConfig) == 0 ||
|
||||
len(cluster.SubClusters) == 0 {
|
||||
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
|
||||
}
|
||||
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
if len(mc.Name) == 0 {
|
||||
return errors.New("cluster.metricConfig.name should not be empty")
|
||||
}
|
||||
if mc.Timestep < 1 {
|
||||
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
|
||||
}
|
||||
|
||||
// For backwards compability...
|
||||
if mc.Scope == "" {
|
||||
mc.Scope = schema.MetricScopeNode
|
||||
}
|
||||
if !mc.Scope.Valid() {
|
||||
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
|
||||
}
|
||||
}
|
||||
|
||||
Clusters = append(Clusters, cluster)
|
||||
|
||||
nodeLists[cluster.Name] = make(map[string]archive.NodeList)
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Nodes == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
nl, err := archive.ParseNodeList(sc.Nodes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
|
||||
}
|
||||
nodeLists[cluster.Name][sc.Name] = nl
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetCluster(cluster string) *Cluster {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
return c
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetSubCluster(cluster, subcluster string) *SubCluster {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, p := range c.SubClusters {
|
||||
if p.Name == subcluster {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMetricConfig(cluster, metric string) *MetricConfig {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, m := range c.MetricConfig {
|
||||
if m.Name == metric {
|
||||
return m
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AssignSubCluster sets the `job.subcluster` property of the job based
|
||||
// on its cluster and resources.
|
||||
func AssignSubCluster(job *BaseJob) error {
|
||||
|
||||
cluster := GetCluster(job.Cluster)
|
||||
if cluster == nil {
|
||||
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
|
||||
}
|
||||
|
||||
if job.SubCluster != "" {
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Name == job.SubCluster {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
|
||||
}
|
||||
|
||||
if len(job.Resources) == 0 {
|
||||
return fmt.Errorf("job without any resources/hosts")
|
||||
}
|
||||
|
||||
host0 := job.Resources[0].Hostname
|
||||
for sc, nl := range nodeLists[job.Cluster] {
|
||||
if nl != nil && nl.Contains(host0) {
|
||||
job.SubCluster = sc
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
if cluster.SubClusters[0].Nodes == "" {
|
||||
job.SubCluster = cluster.SubClusters[0].Name
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
|
||||
}
|
||||
|
||||
func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
||||
|
||||
for sc, nl := range nodeLists[cluster] {
|
||||
if nl != nil && nl.Contains(hostname) {
|
||||
return sc, nil
|
||||
}
|
||||
}
|
||||
|
||||
c := GetCluster(cluster)
|
||||
if c == nil {
|
||||
return "", fmt.Errorf("unkown cluster: %#v", cluster)
|
||||
}
|
||||
|
||||
if c.SubClusters[0].Nodes == "" {
|
||||
return c.SubClusters[0].Name, nil
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
|
||||
}
|
||||
@@ -1,109 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"math"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Float is a float64 with custom JSON and GraphQL encodings.
//
// A custom float type is used so that (Un)MarshalJSON and
// (Un)MarshalGQL can be overloaded and NaN/null can be used.
// The default behaviour of putting every nullable value behind
// a pointer has a bigger overhead.
type Float float64

// NaN is the Float value that `null` is decoded to.
var NaN Float = Float(math.NaN())

// nullAsBytes is the shared encoding of a value that has no numeric
// representation. Callers must not modify the returned slice.
var nullAsBytes []byte = []byte("null")

// IsNaN reports whether f is an IEEE 754 "not-a-number" value.
func (f Float) IsNaN() bool {
	return math.IsNaN(float64(f))
}

// MarshalJSON implements json.Marshaler.
//
// Finite values are written in fixed notation with two decimals. NaN is
// serialized to `null`. +Inf and -Inf are serialized to `null` as well,
// because JSON has no token for them and formatting them would yield the
// invalid output `+Inf`/`-Inf`.
func (f Float) MarshalJSON() ([]byte, error) {
	if f.IsNaN() || math.IsInf(float64(f), 0) {
		return nullAsBytes, nil
	}

	return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64), nil
}

// UnmarshalJSON implements json.Unmarshaler.
// `null` will be unserialized to NaN; anything else must parse as a float.
func (f *Float) UnmarshalJSON(input []byte) error {
	s := string(input)
	if s == "null" {
		*f = NaN
		return nil
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return err
	}
	*f = Float(val)
	return nil
}

// UnmarshalGQL implements the graphql.Unmarshaler interface.
// Only float64 inputs are accepted; any other type is an error.
func (f *Float) UnmarshalGQL(v interface{}) error {
	f64, ok := v.(float64)
	if !ok {
		return errors.New("invalid Float scalar")
	}

	*f = Float(f64)
	return nil
}

// MarshalGQL implements the graphql.Marshaler interface.
// NaN and +/-Inf will be serialized to `null`, finite values with two
// decimals (same encoding as MarshalJSON).
func (f Float) MarshalGQL(w io.Writer) {
	// The method has no error return, so a failed write cannot be reported
	// here.
	if f.IsNaN() || math.IsInf(float64(f), 0) {
		_, _ = w.Write(nullAsBytes)
		return
	}

	_, _ = w.Write(strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64))
}
|
||||
|
||||
// Only used via REST-API, not via GraphQL.
|
||||
// This uses a lot less allocations per series,
|
||||
// but it turns out that the performance increase
|
||||
// from using this is not that big.
|
||||
func (s *Series) MarshalJSON() ([]byte, error) {
|
||||
buf := make([]byte, 0, 512+len(s.Data)*8)
|
||||
buf = append(buf, `{"hostname":"`...)
|
||||
buf = append(buf, s.Hostname...)
|
||||
buf = append(buf, '"')
|
||||
if s.Id != nil {
|
||||
buf = append(buf, `,"id":`...)
|
||||
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
|
||||
}
|
||||
if s.Statistics != nil {
|
||||
buf = append(buf, `,"statistics":{"min":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
|
||||
buf = append(buf, `,"avg":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
|
||||
buf = append(buf, `,"max":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
|
||||
buf = append(buf, '}')
|
||||
}
|
||||
buf = append(buf, `,"data":[`...)
|
||||
for i := 0; i < len(s.Data); i++ {
|
||||
if i != 0 {
|
||||
buf = append(buf, ',')
|
||||
}
|
||||
|
||||
if s.Data[i].IsNaN() {
|
||||
buf = append(buf, `null`...)
|
||||
} else {
|
||||
buf = strconv.AppendFloat(buf, float64(s.Data[i]), 'f', 2, 32)
|
||||
}
|
||||
}
|
||||
buf = append(buf, ']', '}')
|
||||
return buf, nil
|
||||
}
|
||||
@@ -1,142 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
)
|
||||
|
||||
// FsArchiveConfig is the JSON configuration of the file system archive
// backend.
type FsArchiveConfig struct {
	// Path is the root directory of the archive (JSON key "path").
	Path string `json:"path"`
}
|
||||
|
||||
// FsArchive is a job archive backed by a directory tree on the local file
// system.
type FsArchive struct {
	// path is the archive root directory, set by Init.
	path string
	// clusters holds the names of the entries found in the archive root
	// during Init.
	clusters []string
}
|
||||
|
||||
func getPath(
|
||||
job *JobMeta,
|
||||
rootPath string,
|
||||
file string) string {
|
||||
|
||||
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
|
||||
return filepath.Join(
|
||||
rootPath,
|
||||
job.Cluster,
|
||||
lvl1, lvl2,
|
||||
strconv.FormatInt(job.StartTime, 10), file)
|
||||
}
|
||||
|
||||
func loadJobMeta(filename string) (*JobMeta, error) {
|
||||
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend loadJobMeta()- %v", err)
|
||||
return &JobMeta{}, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return DecodeJobMeta(bufio.NewReader(f))
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
|
||||
|
||||
var config FsArchiveConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
if config.Path == "" {
|
||||
err := fmt.Errorf("fsBackend Init()- empty path")
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
fsa.path = config.Path
|
||||
|
||||
entries, err := os.ReadDir(fsa.path)
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
for _, de := range entries {
|
||||
fsa.clusters = append(fsa.clusters, de.Name())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) Iter() <-chan *JobMeta {
|
||||
|
||||
ch := make(chan *JobMeta)
|
||||
go func() {
|
||||
clustersDir, err := os.ReadDir(fsa.path)
|
||||
if err != nil {
|
||||
log.Fatalf("Reading clusters failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, clusterDir := range clustersDir {
|
||||
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, lvl1Dir := range lvl1Dirs {
|
||||
if !lvl1Dir.IsDir() {
|
||||
// Could be the cluster.json file
|
||||
continue
|
||||
}
|
||||
|
||||
lvl2Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name()))
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, lvl2Dir := range lvl2Dirs {
|
||||
dirpath := filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
|
||||
startTimeDirs, err := os.ReadDir(dirpath)
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, startTimeDir := range startTimeDirs {
|
||||
if startTimeDir.IsDir() {
|
||||
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
|
||||
if err != nil {
|
||||
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
|
||||
} else {
|
||||
ch <- job
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) LoadClusterCfg(name string) (*Cluster, error) {
|
||||
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend LoadClusterCfg()- %v", err)
|
||||
return &Cluster{}, err
|
||||
}
|
||||
return DecodeCluster(bytes.NewReader(b))
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) GetClusters() []string {
|
||||
return fsa.clusters
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user