Merge branch 'dev' into 134-job-tagging

Jan Eitzinger 2025-05-13 14:48:58 +02:00
commit 13386175f5
Signed by: moebiusband
GPG Key ID: 2574BA29B90D6DD5
232 changed files with 32360 additions and 16001 deletions


@@ -1,331 +0,0 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Release
# Run on tag push
on:
push:
tags:
- '**'
jobs:
#
# Build on AlmaLinux 8.5 using golang-1.18.2
#
AlmaLinux-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8.5
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which npm
dnf --assumeyes install 'dnf-command(builddep)'
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# AlmaLinux 8.5 is a derivative of RedHat Enterprise Linux 8 (UBI8),
# so both created RPMs contain the substring 'el8' in their file names.
# This step replaces the substring 'el8' with 'alma85'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8.5 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el8/alma85/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el8/alma85}"
NEW_SRPM=${OLD_SRPM/el8/alma85}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "::set-output name=SRPM::${NEW_SRPM}"
echo "::set-output name=RPM::${NEW_RPM}"
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.SRPM }}
#
# Build on UBI 8 using golang-1.18.2
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
dnf --assumeyes --disableplugin=subscription-manager install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
#dnf --assumeyes builddep build/package/cc-backend.spec
- name: RPM build ClusterCockpit
id: rpmbuild
run: make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-focal-build:
runs-on: ubuntu-latest
container: ubuntu:20.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
apt --assume-yes install npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu20.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash npm
npm install --global yarn rollup svelte rollup-plugin-svelte
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build ClusterCockpit
id: dpkg-build
run: |
ls -la
pwd
env
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
git config --global --add safe.directory $(pwd)
make DEB
- name: Rename DEB (add '_ubuntu22.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8.5 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for AlmaLinux 8.5
- name: Download AlmaLinux 8.5 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for AlmaLinux 8.5
- name: Download UBI 8 RPM
uses: actions/download-artifact@v2
with:
name: cc-backend RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v2
with:
name: cc-backend SRPM for UBI 8
- name: Download Ubuntu 20.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 20.04
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-backend DEB for Ubuntu 22.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
# The gh-release action afterwards does not accept a file list as input;
# every file has to be listed individually under 'files'. This step
# creates one output per RPM package (two per distro).
- name: Set RPM variables
id: files
run: |
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
echo "ALMA_85_RPM::${ALMA_85_RPM}"
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "U_2004_DEB::${U_2004_DEB}"
echo "U_2204_DEB::${U_2204_DEB}"
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
# See: https://github.com/softprops/action-gh-release
- name: Release
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
with:
name: cc-backend-${{github.ref_name}}
files: |
${{ steps.files.outputs.ALMA_85_RPM }}
${{ steps.files.outputs.ALMA_85_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.U_2004_DEB }}
${{ steps.files.outputs.U_2204_DEB }}
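
Note: the `::set-output` workflow commands used throughout this (now removed) workflow have since been deprecated by GitHub Actions. A minimal sketch of the same rename-and-publish pattern using the current `$GITHUB_OUTPUT` mechanism; the file name is a placeholder, not taken from a real build:

```bash
# Sketch only: same el8 -> alma85 rename as in the 'rpmrename' step above,
# but publishing the step output via $GITHUB_OUTPUT instead of the
# deprecated ::set-output command.
OLD_RPM="cc-backend-1.0.0-1.el8.x86_64.rpm"  # placeholder; in CI this would come from steps.rpmbuild.outputs.RPM
NEW_RPM="${OLD_RPM/el8/alma85}"
[ -f "${OLD_RPM}" ] && mv "${OLD_RPM}" "${NEW_RPM}"
echo "RPM=${NEW_RPM}" >> "$GITHUB_OUTPUT"
```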


@@ -7,7 +7,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: 1.20.x
go-version: 1.24.x
- name: Checkout code
uses: actions/checkout@v3
- name: Build, Vet & Test

.gitignore vendored

@@ -1,19 +1,23 @@
/cc-backend
/var/job-archive
/var/*.db
/var/machine-state
/.env
/config.json
/var/job-archive
/var/machine-state
/var/job.db-shm
/var/job.db-wal
/var/*.db
/var/*.txt
/web/frontend/public/build
/web/frontend/node_modules
/.vscode/*
/archive-migration
/archive-manager
var/job.db-shm
var/job.db-wal
/internal/repository/testdata/job.db-shm
/internal/repository/testdata/job.db-wal
/.vscode/*
dist/
*.db


@@ -34,19 +34,6 @@ builds:
main: ./tools/archive-manager
tags:
- static_build
- env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
goamd64:
- v3
id: "archive-migration"
binary: archive-migration
main: ./tools/archive-migration
tags:
- static_build
- env:
- CGO_ENABLED=0
goos:
@@ -70,7 +57,7 @@ archives:
{{- else }}{{ .Arch }}{{ end }}
{{- if .Arm }}v{{ .Arm }}{{ end }}
checksum:
name_template: 'checksums.txt'
name_template: "checksums.txt"
snapshot:
name_template: "{{ incpatch .Version }}-next"
changelog:
@@ -100,7 +87,7 @@ changelog:
release:
draft: false
footer: |
Supports job archive version 1 and database version 6.
Supports job archive version 2 and database version 8.
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes.
# vim: set ts=2 sw=2 tw=0 fo=cnqoj


@@ -2,7 +2,7 @@ TARGET = ./cc-backend
VAR = ./var
CFG = config.json .env
FRONTEND = ./web/frontend
VERSION = 1.3.0
VERSION = 1.4.4
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
@@ -22,13 +22,23 @@ SVELTE_COMPONENTS = status \
header
SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS)))
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
$(wildcard $(FRONTEND)/src/*.js) \
$(wildcard $(FRONTEND)/src/filters/*.svelte) \
$(wildcard $(FRONTEND)/src/plots/*.svelte) \
$(wildcard $(FRONTEND)/src/joblist/*.svelte)
SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
$(wildcard $(FRONTEND)/src/*.js) \
$(wildcard $(FRONTEND)/src/analysis/*.svelte) \
$(wildcard $(FRONTEND)/src/config/*.svelte) \
$(wildcard $(FRONTEND)/src/config/admin/*.svelte) \
$(wildcard $(FRONTEND)/src/config/user/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/*.js) \
$(wildcard $(FRONTEND)/src/generic/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \
$(wildcard $(FRONTEND)/src/generic/select/*.svelte) \
$(wildcard $(FRONTEND)/src/header/*.svelte) \
$(wildcard $(FRONTEND)/src/job/*.svelte)
.PHONY: clean distclean test tags frontend $(TARGET)
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
.NOTPARALLEL:
@@ -40,6 +50,15 @@ frontend:
$(info ===> BUILD frontend)
cd web/frontend && npm install && npm run build
swagger:
$(info ===> GENERATE swagger)
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./pkg/schema -g rest.go -o ./api
@mv ./api/docs.go ./internal/api/docs.go
graphql:
$(info ===> GENERATE graphql)
@go run github.com/99designs/gqlgen
clean:
$(info ===> CLEAN)
@go clean
@@ -63,7 +82,7 @@ tags:
@ctags -R
$(VAR):
@mkdir $(VAR)
@mkdir -p $(VAR)
config.json:
$(info ===> Initialize config.json file)


@@ -65,7 +65,7 @@ cd ./cc-backend
./startDemo.sh
```
You can also try the demo using the lates release binary.
You can also try the demo using the latest release binary.
Create a folder and put the release binary `cc-backend` into this folder.
Execute the following steps:
@@ -88,7 +88,9 @@ Analysis, Systems and Status views).
There is a Makefile to automate the build of cc-backend. The Makefile supports
the following targets:
* `make`: Initialize `var` directory and build svelte frontend and backend binary. Note that there is no proper prerequesite handling. Any change of frontend source files will result in a complete rebuild.
* `make`: Initialize `var` directory and build svelte frontend and backend
binary. Note that there is no proper prerequisite handling. Any change of
frontend source files will result in a complete rebuild.
* `make clean`: Clean go build cache and remove binary.
* `make test`: Run the tests that are also run in the GitHub workflow setup.
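
Assuming the toolchain from the build requirements is installed, a from-scratch build and check is then just:

```bash
# Build the frontend and the ./cc-backend binary, then run the CI test suite.
make
make test
```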
@@ -147,8 +149,6 @@ contains Go packages that can be used by other projects.
Additional command line helper tools.
* [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
Commands for getting information about an existing job archive.
* [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
Tool to migrate from previous to current job archive version.
* [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
Tool to convert external pubkey for use in `cc-backend`.
* [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)


@@ -1,12 +1,47 @@
# `cc-backend` version 1.3.0
# `cc-backend` version 1.4.4
Supports job archive version 1 and database version 7.
Supports job archive version 2 and database version 8.
This is a minor release of `cc-backend`, the API backend and frontend
This is a bug fix release of `cc-backend`, the API backend and frontend
implementation of ClusterCockpit.
For release-specific notes visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
## Breaking changes
* This release fixes bugs in the MySQL/MariaDB database schema. For this reason
you have to migrate your database using the `-migrate-db` switch.
The option `apiAllowedIPs` is now a required configuration attribute in
`config.json`. This option restricts access to the admin API.
To retain the previous behavior, where the API is accessible from everywhere
by default, set:
```json
"apiAllowedIPs": [
"*"
]
```
## Breaking changes for minor release 1.4.x
- You need to perform a database migration. Depending on your database size the
migration might require several hours!
- You need to adapt the `cluster.json` configuration files in the job-archive,
add the new required attributes to the metric list, and then set
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
attribute set can be filtered and show up in the footprint UI and polar plot.
- Continuous scrolling is now the default in all job lists. You can switch back
to paging globally, and every user can individually configure paging or
continuous scrolling.
- Tags have a scope now. Existing tags will get global scope in the database
migration.
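
Assuming the default SQLite setup created by `cc-backend -init`, the database migration mentioned above is a single invocation; MySQL/MariaDB setups use the driver and location from their `config.json`:

```bash
# Run once with the server stopped; cc-backend migrates the schema and exits.
./cc-backend -migrate-db -config ./config.json
```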
## New features
- Tags can now be deleted from the web interface
## Known issues
- Currently, energy footprint metrics of type energy are ignored when
calculating the total energy.
- Resampling for running jobs only works with cc-metric-store.
- For energy footprint metrics of type power, the unit is ignored and the
metric is assumed to be in Watts.


@@ -18,6 +18,7 @@ type Job {
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
energy: Float!
SMT: Int!
exclusive: Int!
partition: String!
@@ -27,12 +28,8 @@ type Job {
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
memUsedMax: Float
flopsAnyAvg: Float
memBwAvg: Float
loadAvg: Float
footprint: [FootprintValue]
energyFootprint: [EnergyFootprintValue]
metaData: Any
userData: User
}
@@ -45,7 +42,6 @@ type JobLink {
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
@@ -61,9 +57,24 @@ type SubCluster {
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
metricConfig: [MetricConfig!]!
footprint: [String!]!
}
type FootprintValue {
name: String!
stat: String!
value: Float!
}
type EnergyFootprintValue {
hardware: String!
metric: String!
value: Float!
}
type MetricValue {
name: String
unit: Unit!
value: Float!
}
@@ -102,6 +113,7 @@ type MetricConfig {
normal: Float
caution: Float!
alert: Float!
lowerIsBetter: Boolean
subClusters: [SubClusterConfig!]!
}
@@ -109,6 +121,7 @@ type Tag {
id: ID!
type: String!
name: String!
scope: String!
}
type Resource {
@@ -138,6 +151,43 @@ type Series {
data: [NullableFloat!]!
}
type StatsSeries {
mean: [NullableFloat!]!
median: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type NamedStatsWithScope {
name: String!
scope: MetricScope!
stats: [ScopedStats!]!
}
type ScopedStats {
hostname: String!
id: String
data: MetricStatistics!
}
type JobStats {
id: Int!
jobId: String!
startTime: Int!
duration: Int!
cluster: String!
subCluster: String!
numNodes: Int!
numHWThreads: Int
numAccelerators: Int
stats: [NamedStats!]!
}
type NamedStats {
name: String!
data: MetricStatistics!
}
type Unit {
base: String!
prefix: String
@@ -149,12 +199,6 @@ type MetricStatistics {
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
@@ -180,6 +224,28 @@ type NodeMetrics {
metrics: [JobMetricWithName!]!
}
type NodesResultList {
items: [NodeMetrics!]!
offset: Int
limit: Int
count: Int
totalNodes: Int
hasNextPage: Boolean
}
type ClusterSupport {
cluster: String!
subClusters: [String!]!
}
type GlobalMetricListItem {
name: String!
unit: Unit!
scope: MetricScope!
footprint: String
availability: [ClusterSupport!]!
}
type Count {
name: String!
count: Int!
@@ -191,39 +257,51 @@ type User {
email: String!
}
input MetricStatItem {
metricName: String!
range: FloatRange!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
globalMetrics: [GlobalMetricListItem!]!
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]!
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]!
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate): [JobsStatistics!]!
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]!
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList!
}
type Mutation {
createTag(type: String!, name: String!): Tag!
createTag(type: String!, name: String!, scope: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagFromList(tagIds: [ID!]!): [Int!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type TimeRangeOutput { range: String, from: Time!, to: Time! }
input JobFilter {
tags: [ID!]
dbId: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
@@ -232,6 +310,7 @@ input JobFilter {
cluster: StringInput
partition: StringInput
duration: IntRange
energy: FloatRange
minRunningFor: Int
@@ -241,17 +320,14 @@
startTime: TimeRange
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
metricStats: [MetricStatItem!]
exclusive: Int
node: StringInput
}
input OrderByInput {
field: String!
type: String!,
order: SortDirectionEnum! = ASC
}
@@ -269,9 +345,13 @@ input StringInput {
in: [String!]
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
input IntRange { from: Int!, to: Int! }
input TimeRange { range: String, from: Time, to: Time }
input FloatRange {
from: Float!
to: Float!
}
type JobResultList {
items: [Job!]!
@@ -295,6 +375,7 @@ type HistoPoint {
type MetricHistoPoints {
metric: String!
unit: String!
stat: String
data: [MetricHistoPoint!]
}
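
To illustrate the new tag scope end-to-end: `createTag` now requires a `scope` argument. A hedged sketch of calling it on a local instance follows; the `/query` endpoint path and the `JWT` variable are assumptions for a development setup:

```bash
# Sketch only: create a tag with the now-mandatory global scope.
curl -s http://localhost:8080/query \
  -H "Authorization: Bearer ${JWT}" \
  -H "Content-Type: application/json" \
  -d '{"query": "mutation { createTag(type: \"Debug\", name: \"Testjob\", scope: \"global\") { id type name scope } }"}'
```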

File diff suppressed because it is too large.


@@ -1,4 +1,3 @@
basePath: /api
definitions:
api.ApiReturnedUser:
properties:
@@ -23,11 +22,20 @@ definitions:
description: Tag Name
example: Testjob
type: string
scope:
description: Tag Scope for Frontend Display
example: global
type: string
type:
description: Tag Type
example: Debug
type: string
type: object
api.DefaultJobApiResponse:
properties:
msg:
type: string
type: object
api.DeleteJobApiRequest:
properties:
cluster:
@@ -45,11 +53,6 @@ definitions:
required:
- jobId
type: object
api.DeleteJobApiResponse:
properties:
msg:
type: string
type: object
api.EditMetaRequest:
properties:
key:
@@ -108,33 +111,22 @@ definitions:
scope:
$ref: '#/definitions/schema.MetricScope'
type: object
api.StartJobApiResponse:
properties:
id:
description: Database ID of new job
type: integer
type: object
api.StopJobApiRequest:
properties:
cluster:
description: Cluster of job
example: fritz
type: string
jobId:
description: Cluster Job ID of job
example: 123000
type: integer
jobState:
allOf:
- $ref: '#/definitions/schema.JobState'
description: Final job state
example: completed
startTime:
description: Start Time of job as epoch
example: 1649723812
type: integer
stopTime:
description: Stop Time of job as epoch
example: 1649763839
type: integer
required:
@@ -167,42 +159,40 @@ definitions:
description: Information of a HPC job.
properties:
arrayJobId:
description: The unique identifier of an array job
example: 123000
type: integer
cluster:
description: The unique identifier of a cluster
example: fritz
type: string
concurrentJobs:
$ref: '#/definitions/schema.JobLinkResultList'
duration:
description: Duration of job in seconds (Min > 0)
example: 43200
minimum: 1
type: integer
energy:
type: number
energyFootprint:
additionalProperties:
type: number
type: object
exclusive:
description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs
of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple
jobs of same user'
example: 1
maximum: 2
minimum: 0
type: integer
flopsAnyAvg:
description: FlopsAnyAvg as Float64
type: number
footprint:
additionalProperties:
type: number
type: object
id:
description: The unique identifier of a job in the database
type: integer
jobId:
description: The unique identifier of a job
example: 123000
type: integer
jobState:
allOf:
- $ref: '#/definitions/schema.JobState'
description: Final state of job
enum:
- completed
- failed
@@ -211,79 +201,53 @@ definitions:
- timeout
- out_of_memory
example: completed
loadAvg:
description: LoadAvg as Float64
type: number
memBwAvg:
description: MemBwAvg as Float64
type: number
memUsedMax:
description: MemUsedMax as Float64
type: number
metaData:
additionalProperties:
type: string
description: Additional information about the job
type: object
monitoringStatus:
description: 'State of monitoring system during job run: 0 - Disabled, 1 -
Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful'
example: 1
maximum: 3
minimum: 0
type: integer
numAcc:
description: Number of accelerators used (Min > 0)
example: 2
minimum: 1
type: integer
numHwthreads:
description: NumCores int32 `json:"numCores" db:"num_cores"
example:"20" minimum:"1"` //
Number of HWThreads used (Min > 0)
example: 20
minimum: 1
type: integer
numNodes:
description: Number of nodes used (Min > 0)
example: 2
minimum: 1
type: integer
partition:
description: The Slurm partition to which the job was submitted
example: main
type: string
project:
description: The unique identifier of a project
example: abcd200
type: string
resources:
description: Resources used by job
items:
$ref: '#/definitions/schema.Resource'
type: array
smt:
description: SMT threads used by job
example: 4
type: integer
startTime:
description: Start time as 'time.Time' data type
type: string
subCluster:
description: The unique identifier of a sub cluster
example: main
type: string
tags:
description: List of tags
items:
$ref: '#/definitions/schema.Tag'
type: array
user:
description: The unique identifier of a user
example: abcd100h
type: string
walltime:
description: Requested walltime of job in seconds (Min > 0)
example: 86400
minimum: 1
type: integer
@@ -308,39 +272,40 @@ definitions:
description: Meta data information of a HPC job.
properties:
arrayJobId:
description: The unique identifier of an array job
example: 123000
type: integer
cluster:
description: The unique identifier of a cluster
example: fritz
type: string
concurrentJobs:
$ref: '#/definitions/schema.JobLinkResultList'
duration:
description: Duration of job in seconds (Min > 0)
example: 43200
minimum: 1
type: integer
energy:
type: number
energyFootprint:
additionalProperties:
type: number
type: object
exclusive:
description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs
of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple
jobs of same user'
example: 1
maximum: 2
minimum: 0
type: integer
footprint:
additionalProperties:
type: number
type: object
id:
description: The unique identifier of a job in the database
type: integer
jobId:
description: The unique identifier of a job
example: 123000
type: integer
jobState:
allOf:
- $ref: '#/definitions/schema.JobState'
description: Final state of job
enum:
- completed
- failed
@@ -352,74 +317,56 @@ definitions:
metaData:
additionalProperties:
type: string
description: Additional information about the job
type: object
monitoringStatus:
description: 'State of monitoring system during job run: 0 - Disabled, 1 -
Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful'
example: 1
maximum: 3
minimum: 0
type: integer
numAcc:
description: Number of accelerators used (Min > 0)
example: 2
minimum: 1
type: integer
numHwthreads:
description: NumCores int32 `json:"numCores" db:"num_cores"
example:"20" minimum:"1"` //
Number of HWThreads used (Min > 0)
example: 20
minimum: 1
type: integer
numNodes:
description: Number of nodes used (Min > 0)
example: 2
minimum: 1
type: integer
partition:
description: The Slurm partition to which the job was submitted
example: main
type: string
project:
description: The unique identifier of a project
example: abcd200
type: string
resources:
description: Resources used by job
items:
$ref: '#/definitions/schema.Resource'
type: array
smt:
description: SMT threads used by job
example: 4
type: integer
startTime:
description: Start epoch time stamp in seconds (Min > 0)
example: 1649723812
minimum: 1
type: integer
statistics:
additionalProperties:
$ref: '#/definitions/schema.JobStatistics'
description: Metric statistics of job
type: object
subCluster:
description: The unique identifier of a sub cluster
example: main
type: string
tags:
description: List of tags
items:
$ref: '#/definitions/schema.Tag'
type: array
user:
description: The unique identifier of a user
example: abcd100h
type: string
walltime:
description: Requested walltime of job in seconds (Min > 0)
example: 86400
minimum: 1
type: integer
@@ -486,6 +433,12 @@ definitions:
type: number
caution:
type: number
energy:
type: string
footprint:
type: string
lowerIsBetter:
type: boolean
name:
type: string
normal:
@@ -541,18 +494,14 @@ definitions:
description: A resource used by a job
properties:
accelerators:
description: List of accelerator device ids
items:
type: string
type: array
configuration:
description: The configuration options of the node
type: string
hostname:
description: Name of the host (= node)
type: string
hwthreads:
description: List of OS processor ids
items:
type: integer
type: array
@@ -580,6 +529,10 @@ definitions:
items:
type: number
type: array
median:
items:
type: number
type: array
min:
items:
type: number
@@ -595,12 +548,24 @@ definitions:
properties:
coresPerSocket:
type: integer
energyFootprint:
items:
type: string
type: array
flopRateScalar:
$ref: '#/definitions/schema.MetricValue'
flopRateSimd:
$ref: '#/definitions/schema.MetricValue'
footprint:
items:
type: string
type: array
memoryBandwidth:
$ref: '#/definitions/schema.MetricValue'
metricConfig:
items:
$ref: '#/definitions/schema.MetricConfig'
type: array
name:
type: string
nodes:
@@ -620,6 +585,12 @@ definitions:
type: number
caution:
type: number
energy:
type: string
footprint:
type: string
lowerIsBetter:
type: boolean
name:
type: string
normal:
@@ -633,14 +604,14 @@ definitions:
description: Defines a tag using name and type.
properties:
id:
description: The unique DB identifier of a tag
type: integer
name:
description: Tag Name
example: Testjob
type: string
scope:
example: global
type: string
type:
description: Tag Type
example: Debug
type: string
type: object
@@ -699,7 +670,7 @@ info:
title: ClusterCockpit REST API
version: 1.0.0
paths:
/clusters/:
/api/clusters/:
get:
description: Get a list of all cluster configs. Specific cluster can be requested
using query parameter.
@@ -736,7 +707,7 @@ paths:
summary: Lists all cluster configs
tags:
- Cluster query
/jobs/:
/api/jobs/:
get:
description: |-
Get a list of all jobs. Filters can be applied using query parameters.
@@ -801,7 +772,7 @@ paths:
summary: Lists all jobs
tags:
- Job query
/jobs/{id}:
/api/jobs/{id}:
get:
description: |-
Job to get is specified by database ID
@@ -910,7 +881,7 @@ paths:
summary: Get job meta and configurable metric data
tags:
- Job query
/jobs/delete_job/:
/api/jobs/delete_job/:
delete:
consumes:
- application/json
@@ -929,7 +900,7 @@ paths:
"200":
description: Success message
schema:
$ref: '#/definitions/api.DeleteJobApiResponse'
$ref: '#/definitions/api.DefaultJobApiResponse'
"400":
description: Bad Request
schema:
@@ -960,7 +931,7 @@ paths:
summary: Remove a job from the sql database
tags:
- Job remove
/jobs/delete_job/{id}:
/api/jobs/delete_job/{id}:
delete:
description: Job to remove is specified by database ID. This will not remove
the job from the job archive.
@@ -976,7 +947,7 @@ paths:
"200":
description: Success message
schema:
$ref: '#/definitions/api.DeleteJobApiResponse'
$ref: '#/definitions/api.DefaultJobApiResponse'
"400":
description: Bad Request
schema:
@@ -1007,7 +978,7 @@ paths:
summary: Remove a job from the sql database
tags:
- Job remove
/jobs/delete_job_before/{ts}:
/api/jobs/delete_job_before/{ts}:
delete:
description: Remove all jobs with start time before timestamp. The jobs will
not be removed from the job archive.
@@ -1023,7 +994,7 @@ paths:
"200":
description: Success message
schema:
$ref: '#/definitions/api.DeleteJobApiResponse'
$ref: '#/definitions/api.DefaultJobApiResponse'
"400":
description: Bad Request
schema:
@@ -1054,7 +1025,7 @@ paths:
summary: Remove a job from the sql database
tags:
- Job remove
/jobs/edit_meta/{id}:
/api/jobs/edit_meta/{id}:
post:
consumes:
- application/json
@@ -1101,7 +1072,7 @@ paths:
summary: Edit meta-data json
tags:
- Job add and modify
/jobs/start_job/:
/api/jobs/start_job/:
post:
consumes:
- application/json
@@ -1121,7 +1092,7 @@ paths:
"201":
description: Job added successfully
schema:
$ref: '#/definitions/api.StartJobApiResponse'
$ref: '#/definitions/api.DefaultJobApiResponse'
"400":
description: Bad Request
schema:
@@ -1148,7 +1119,7 @@ paths:
summary: Adds a new job as "running"
tags:
- Job add and modify
/jobs/stop_job/:
/api/jobs/stop_job/:
post:
description: |-
Job to stop is specified by request body. All fields are required in this case.
@@ -1184,8 +1155,7 @@ paths:
schema:
$ref: '#/definitions/api.ErrorResponse'
"422":
description: 'Unprocessable Entity: finding job failed: sql: no rows in
result set'
description: 'Unprocessable Entity: job has already been stopped'
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
@@ -1197,68 +1167,13 @@ paths:
summary: Marks job as completed and triggers archiving
tags:
- Job add and modify
/jobs/stop_job/{id}:
post:
consumes:
- application/json
description: |-
Job to stop is specified by database ID. Only stopTime and final state are required in request body.
Returns full job resource information according to 'JobMeta' scheme.
parameters:
- description: Database ID of Job
in: path
name: id
required: true
type: integer
- description: stopTime and final state in request body
in: body
name: request
required: true
schema:
$ref: '#/definitions/api.StopJobApiRequest'
produces:
- application/json
responses:
"200":
description: Job resource
schema:
$ref: '#/definitions/schema.JobMeta'
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"404":
description: Resource not found
schema:
$ref: '#/definitions/api.ErrorResponse'
"422":
description: 'Unprocessable Entity: finding job failed: sql: no rows in
result set'
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: Marks job as completed and triggers archiving
tags:
- Job add and modify
/jobs/tag_job/{id}:
/api/jobs/tag_job/{id}:
post:
consumes:
- application/json
description: |-
Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
If tagged job is already finished: Tag will be written directly to respective archive files.
parameters:
- description: Job Database ID
@@ -1302,128 +1217,11 @@ paths:
summary: Adds one or more tags to a job
tags:
- Job add and modify
/user/{id}:
post:
consumes:
- multipart/form-data
description: |-
Modifies user defined by username (id) in one of four possible ways.
If more than one formValue is set then only the highest priority field is used.
Only accessible from IPs registered with apiAllowedIPs configuration option.
parameters:
- description: Database ID of User
in: path
name: id
required: true
type: string
- description: 'Priority 1: Role to add'
enum:
- admin
- support
- manager
- user
- api
in: formData
name: add-role
type: string
- description: 'Priority 2: Role to remove'
enum:
- admin
- support
- manager
- user
- api
in: formData
name: remove-role
type: string
- description: 'Priority 3: Project to add'
in: formData
name: add-project
type: string
- description: 'Priority 4: Project to remove'
in: formData
name: remove-project
type: string
produces:
- text/plain
responses:
"200":
description: Success Response Message
schema:
type: string
"400":
description: Bad Request
schema:
type: string
"401":
description: Unauthorized
schema:
type: string
"403":
description: Forbidden
schema:
type: string
"422":
description: 'Unprocessable Entity: The user could not be updated'
schema:
type: string
"500":
description: Internal Server Error
schema:
type: string
security:
- ApiKeyAuth: []
summary: Updates an existing user
tags:
- User
/users/:
delete:
consumes:
- multipart/form-data
description: |-
User defined by username in form data will be deleted from database.
Only accessible from IPs registered with apiAllowedIPs configuration option.
parameters:
- description: User ID to delete
in: formData
name: username
required: true
type: string
produces:
- text/plain
responses:
"200":
description: User deleted successfully
"400":
description: Bad Request
schema:
type: string
"401":
description: Unauthorized
schema:
type: string
"403":
description: Forbidden
schema:
type: string
"422":
description: 'Unprocessable Entity: deleting user failed'
schema:
type: string
"500":
description: Internal Server Error
schema:
type: string
security:
- ApiKeyAuth: []
summary: Deletes a user
tags:
- User
/api/users/:
get:
description: |-
Returns a JSON-encoded list of users.
Required query-parameter defines if all users or only users with additional special roles are returned.
Only accessible from IPs registered with apiAllowedIPs configuration option.
parameters:
- description: If returned list should contain all users or only users with
additional special roles
@@ -1461,46 +1259,73 @@
summary: Returns a list of users
tags:
- User
post:
/jobs/tag_job/{id}:
delete:
consumes:
- multipart/form-data
- application/json
description: |-
User specified in form data will be saved to database.
Only accessible from IPs registered with apiAllowedIPs configuration option.
Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.
Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
If tagged job is already finished: Tag will be removed from respective archive files.
parameters:
- description: Unique user ID
in: formData
name: username
- description: Job Database ID
in: path
name: id
required: true
type: string
- description: User password
in: formData
name: password
type: integer
- description: Array of tag-objects to remove
in: body
name: request
required: true
type: string
- description: User role
enum:
- admin
- support
- manager
- user
- api
in: formData
name: role
schema:
items:
$ref: '#/definitions/api.ApiTag'
type: array
produces:
- application/json
responses:
"200":
description: Updated job resource
schema:
$ref: '#/definitions/schema.Job'
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"404":
description: Job or tag does not exist
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: Removes one or more tags from a job
tags:
- Job add and modify
/tags/:
delete:
consumes:
- application/json
description: |-
Removes tags by type and name. Name and Type of Tag(s) must match.
Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
Tags will be removed from the respective archive files.
parameters:
- description: Array of tag-objects to remove
in: body
name: request
required: true
type: string
- description: Managed project, required for new manager role user
in: formData
name: project
type: string
- description: Users name
in: formData
name: name
type: string
- description: Users email
in: formData
name: email
type: string
schema:
items:
$ref: '#/definitions/api.ApiTag'
type: array
produces:
- text/plain
responses:
@@ -1511,28 +1336,24 @@ paths:
"400":
description: Bad Request
schema:
type: string
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
type: string
"403":
description: Forbidden
$ref: '#/definitions/api.ErrorResponse'
"404":
description: Job or tag does not exist
schema:
type: string
"422":
description: 'Unprocessable Entity: creating user failed'
schema:
type: string
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
type: string
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: Adds a new user
summary: Removes all tags and job-relations for type:name tuple
tags:
- User
- Tag remove
securityDefinitions:
ApiKeyAuth:
in: header
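
A request against the new tag-removal endpoint documented above could look like the following hedged sketch; host and `JWT` are assumptions, and the body mirrors the expected array of `api.ApiTag` objects:

```bash
# Sketch only: remove a global tag from the job with database ID 123,
# matching on type, name and the now-required scope.
curl -s -X DELETE http://localhost:8080/jobs/tag_job/123 \
  -H "Authorization: Bearer ${JWT}" \
  -H "Content-Type: application/json" \
  -d '[{"type": "Debug", "name": "Testjob", "scope": "global"}]'
```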

cmd/cc-backend/cli.go Normal file

@@ -0,0 +1,33 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import "flag"
var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
)
func cliInit() {
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize sqlite database file, config.json and .env")
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'hpc_user' table with ldap")
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
flag.StringVar(&flagDelUser, "del-user", "", "Remove an existing user. Argument format: <username>")
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info (default), warn, err, crit]`")
flag.Parse()
}
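
A few representative invocations of the flags defined above; usernames, roles and passwords are placeholders:

```bash
# Sketch only: common cc-backend CLI invocations.
./cc-backend -init                               # set up var/, config.json and .env
./cc-backend -add-user 'alice:admin,api:secret'  # <username>:<roles>:<password>
./cc-backend -jwt alice                          # print a JWT for user 'alice'
./cc-backend -server -dev -loglevel debug        # serve with Playground and Swagger UI
```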

cmd/cc-backend/init.go Normal file

@@ -0,0 +1,95 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"os"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
const envString = `
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
# You can generate your own keypair using the gen-keypair tool
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
`
const configString = `
{
"addr": "127.0.0.1:8080",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2000h"
},
"apiAllowedIPs": [
"*"
],
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
}
}
}
]
}
`
func initEnv() {
if util.CheckFileExists("var") {
log.Exit("Directory ./var already exists. Cautiously exiting application initialization.")
}
if err := os.WriteFile("config.json", []byte(configString), 0o666); err != nil {
log.Abortf("Could not write default ./config.json with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
if err := os.WriteFile(".env", []byte(envString), 0o666); err != nil {
log.Abortf("Could not write default ./.env file with permissions '0o666'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
if err := os.Mkdir("var", 0o777); err != nil {
log.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
err := repository.MigrateDB("sqlite3", "./var/job.db")
if err != nil {
log.Abortf("Could not initialize default sqlite3 database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
}
}


@@ -5,155 +5,49 @@
package main
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net"
"net/http"
"os"
"os/signal"
"runtime"
"runtime/debug"
"strings"
"sync"
"syscall"
"time"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/internal/runtimeEnv"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/internal/taskManager"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/web"
"github.com/go-co-op/gocron"
"github.com/google/gops/agent"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
httpSwagger "github.com/swaggo/http-swagger"
"github.com/joho/godotenv"
_ "github.com/go-sql-driver/mysql"
_ "github.com/mattn/go-sqlite3"
)
const logoString = `
____ _ _ ____ _ _ _
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
_____ _ _ ____ _ _ _
/ ___| |_ _ ___| |_ ___ _ __ / ___|___ ___| | ___ __ (_) |_
| | | | | | / __| __/ _ \ '__| | / _ \ / __| |/ / '_ \| | __|
| |___| | |_| \__ \ || __/ | | |__| (_) | (__| <| |_) | | |_
\____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
\_____|_|\__,_|___/\__\___|_| \____\___/ \___|_|\_\ .__/|_|\__|
|_|
`
const envString = `
# Base64 encoded Ed25519 keys (DO NOT USE THESE TWO IN PRODUCTION!)
# You can generate your own keypair using the gen-keypair tool
JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
# Some random bytes used as secret for cookie-based sessions (DO NOT USE THIS ONE IN PRODUCTION)
SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
`
const configString = `
{
"addr": "127.0.0.1:8080",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "name",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2023-01-01T00:00:00Z",
"to": null
}
}
}
]
}
`
var (
date string
commit string
version string
)
func initEnv() {
if util.CheckFileExists("var") {
fmt.Print("Directory ./var already exists. Exiting!\n")
os.Exit(0)
}
if err := os.WriteFile("config.json", []byte(configString), 0666); err != nil {
log.Fatalf("Writing config.json failed: %s", err.Error())
}
if err := os.WriteFile(".env", []byte(envString), 0666); err != nil {
log.Fatalf("Writing .env failed: %s", err.Error())
}
if err := os.Mkdir("var", 0777); err != nil {
log.Fatalf("Mkdir var failed: %s", err.Error())
}
err := repository.MigrateDB("sqlite3", "./var/job.db")
if err != nil {
log.Fatalf("Initialize job.db failed: %s", err.Error())
}
}
func main() {
var flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool
var flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
flag.BoolVar(&flagInit, "init", false, "Setup var directory, initialize swlite database file, config.json and .env")
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize the 'job', 'tag', and 'jobtag' tables (all running jobs will be lost!)")
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the 'user' table with ldap")
flag.BoolVar(&flagServer, "server", false, "Start a server, continues listening on port after initialization and argument handling")
flag.BoolVar(&flagGops, "gops", false, "Listen via github.com/google/gops/agent (for debugging)")
flag.BoolVar(&flagDev, "dev", false, "Enable development components: GraphQL Playground and Swagger UI")
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin,support,manager,api,user]:<password>`")
flag.StringVar(&flagDelUser, "del-user", "", "Remove user by `username`")
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`")
flag.Parse()
cliInit()
if flagVersion {
fmt.Print(logoString)
@@ -168,23 +62,24 @@ func main() {
// Apply config flags for pkg/log
log.Init(flagLogLevel, flagLogDateTime)
// If init flag set, run tasks here before any file dependencies cause errors
if flagInit {
initEnv()
fmt.Print("Succesfully setup environment!\n")
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
fmt.Print("Add your job-archive at ./var/job-archive.\n")
os.Exit(0)
log.Exit("Successfully setup environment!\n" +
"Please review config.json and .env and adjust it to your needs.\n" +
"Add your job-archive at ./var/job-archive.")
}
// See https://github.com/google/gops (Runtime overhead is almost zero)
if flagGops {
if err := agent.Listen(agent.Options{}); err != nil {
log.Fatalf("gops/agent.Listen failed: %s", err.Error())
log.Abortf("Could not start gops agent with 'gops/agent.Listen(agent.Options{})'. Application startup failed, exited.\nError: %s\n", err.Error())
}
}
if err := runtimeEnv.LoadEnv("./.env"); err != nil && !os.IsNotExist(err) {
log.Fatalf("parsing './.env' file failed: %s", err.Error())
err := godotenv.Load()
if err != nil {
log.Abortf("Could not parse existing .env file at location './.env'. Application startup failed, exited.\nError: %s\n", err.Error())
}
// Initialize sub-modules and handle command line flags.
@@ -202,341 +97,134 @@ func main() {
if flagMigrateDB {
err := repository.MigrateDB(config.Keys.DBDriver, config.Keys.DB)
if err != nil {
log.Fatal(err)
log.Abortf("MigrateDB Failed: Could not migrate '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, repository.Version, err.Error())
}
os.Exit(0)
log.Exitf("MigrateDB Success: Migrated '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, repository.Version)
}
if flagRevertDB {
err := repository.RevertDB(config.Keys.DBDriver, config.Keys.DB)
if err != nil {
log.Fatal(err)
log.Abortf("RevertDB Failed: Could not revert '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, (repository.Version - 1), err.Error())
}
os.Exit(0)
log.Exitf("RevertDB Success: Reverted '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, (repository.Version - 1))
}
if flagForceDB {
err := repository.ForceDB(config.Keys.DBDriver, config.Keys.DB)
if err != nil {
log.Fatal(err)
log.Abortf("ForceDB Failed: Could not force '%s' database at location '%s' to version %d.\nError: %s\n", config.Keys.DBDriver, config.Keys.DB, repository.Version, err.Error())
}
os.Exit(0)
log.Exitf("ForceDB Success: Forced '%s' database at location '%s' to version %d.\n", config.Keys.DBDriver, config.Keys.DB, repository.Version)
}
repository.Connect(config.Keys.DBDriver, config.Keys.DB)
db := repository.GetConnection()
var authentication *auth.Authentication
if !config.Keys.DisableAuthentication {
var err error
if authentication, err = auth.Init(); err != nil {
log.Fatalf("auth initialization failed: %v", err)
}
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err != nil {
authentication.SessionMaxAge = d
}
auth.Init()
if flagNewUser != "" {
parts := strings.SplitN(flagNewUser, ":", 3)
if len(parts) != 3 || len(parts[0]) == 0 {
log.Fatal("invalid argument format for user creation")
log.Abortf("Add User: Could not parse supplied argument format: No changes.\n"+
"Want: <username>:[admin,support,manager,api,user]:<password>\n"+
"Have: %s\n", flagNewUser)
}
ur := repository.GetUserRepository()
if err := ur.AddUser(&schema.User{
Username: parts[0], Projects: make([]string, 0), Password: parts[2], Roles: strings.Split(parts[1], ","),
}); err != nil {
log.Fatalf("adding '%s' user authentication failed: %v", parts[0], err)
log.Abortf("Add User: Could not add new user authentication for '%s' and roles '%s'.\nError: %s\n", parts[0], parts[1], err.Error())
} else {
log.Printf("Add User: Added new user '%s' with roles '%s'.\n", parts[0], parts[1])
}
}
if flagDelUser != "" {
ur := repository.GetUserRepository()
if err := ur.DelUser(flagDelUser); err != nil {
log.Fatalf("deleting user failed: %v", err)
log.Abortf("Delete User: Could not delete user '%s' from DB.\nError: %s\n", flagDelUser, err.Error())
} else {
log.Printf("Delete User: Deleted user '%s' from DB.\n", flagDelUser)
}
}
authHandle := auth.GetAuthInstance()
if flagSyncLDAP {
if authentication.LdapAuth == nil {
log.Fatal("cannot sync: LDAP authentication is not configured")
if authHandle.LdapAuth == nil {
log.Abort("Sync LDAP: LDAP authentication is not configured, could not synchronize. No changes, exited.")
}
if err := authentication.LdapAuth.Sync(); err != nil {
log.Fatalf("LDAP sync failed: %v", err)
if err := authHandle.LdapAuth.Sync(); err != nil {
log.Abortf("Sync LDAP: Could not synchronize, failed with error.\nError: %s\n", err.Error())
}
log.Info("LDAP sync successfull")
log.Print("Sync LDAP: LDAP synchronization successfull.")
}
if flagGenJWT != "" {
ur := repository.GetUserRepository()
user, err := ur.GetUser(flagGenJWT)
if err != nil {
log.Fatalf("could not get user from JWT: %v", err)
log.Abortf("JWT: Could not get supplied user '%s' from DB. No changes, exited.\nError: %s\n", flagGenJWT, err.Error())
}
if !user.HasRole(schema.RoleApi) {
log.Warnf("user '%s' does not have the API role", user.Username)
log.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username)
}
jwt, err := authentication.JwtAuth.ProvideJWT(user)
jwt, err := authHandle.JwtAuth.ProvideJWT(user)
if err != nil {
log.Fatalf("failed to provide JWT to user '%s': %v", user.Username, err)
log.Abortf("JWT: User '%s' found in DB, but failed to provide JWT.\nError: %s\n", user.Username, err.Error())
}
fmt.Printf("MAIN > JWT for '%s': %s\n", user.Username, jwt)
log.Printf("JWT: Successfully generated JWT for user '%s': %s\n", user.Username, jwt)
}
} else if flagNewUser != "" || flagDelUser != "" {
log.Fatal("arguments --add-user and --del-user can only be used if authentication is enabled")
log.Abort("Error: Arguments '--add-user' and '--del-user' can only be used if authentication is enabled. No changes, exited.")
}
if err := archive.Init(config.Keys.Archive, config.Keys.DisableArchive); err != nil {
log.Fatalf("failed to initialize archive: %s", err.Error())
log.Abortf("Init: Failed to initialize archive.\nError: %s\n", err.Error())
}
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
log.Fatalf("failed to initialize metricdata repository: %s", err.Error())
if err := metricdata.Init(); err != nil {
log.Abortf("Init: Failed to initialize metricdata repository.\nError %s\n", err.Error())
}
if flagReinitDB {
if err := importer.InitDB(); err != nil {
log.Fatalf("failed to re-initialize repository DB: %s", err.Error())
log.Abortf("Init DB: Failed to re-initialize repository DB.\nError: %s\n", err.Error())
} else {
log.Print("Init DB: Sucessfully re-initialized repository DB.")
}
}
if flagImportJob != "" {
if err := importer.HandleImportFlag(flagImportJob); err != nil {
log.Fatalf("job import failed: %s", err.Error())
log.Abortf("Import Job: Job import failed.\nError: %s\n", err.Error())
} else {
log.Printf("Import Job: Imported Job '%s' into DB.\n", flagImportJob)
}
}
if !flagServer {
return
log.Exit("No errors, server flag not set. Exiting cc-backend.")
}
// Setup the http.Handler/Router used by the server
jobRepo := repository.GetJobRepository()
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
if os.Getenv("DEBUG") != "1" {
// Having this handler means that an error message is returned via GraphQL instead of the connection simply being closed.
// The problem with this is that then no stacktrace is printed to stderr.
graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
switch e := err.(type) {
case string:
return fmt.Errorf("MAIN > Panic: %s", e)
case error:
return fmt.Errorf("MAIN > Panic caused by: %w", e)
}
return errors.New("MAIN > Internal server error (panic)")
})
}
api := &api.RestApi{
JobRepository: jobRepo,
Resolver: resolver,
MachineStateDir: config.Keys.MachineStateDir,
Authentication: authentication,
}
r := mux.NewRouter()
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
info := map[string]interface{}{}
info["hasOpenIDConnect"] = false
if config.Keys.OpenIDConfig != nil {
openIDConnect := auth.NewOIDC(authentication)
openIDConnect.RegisterEndpoints(r)
info["hasOpenIDConnect"] = true
}
r.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
log.Debugf("##%v##", info)
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
}).Methods(http.MethodGet)
r.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
})
r.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
})
secured := r.PathPrefix("/").Subrouter()
if !config.Keys.DisableAuthentication {
r.Handle("/login", authentication.Login(
// On success:
http.RedirectHandler("/", http.StatusTemporaryRedirect),
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
})).Methods(http.MethodPost)
r.Handle("/jwt-login", authentication.Login(
// On success:
http.RedirectHandler("/", http.StatusTemporaryRedirect),
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
}))
r.Handle("/logout", authentication.Logout(
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusOK)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Bye - ClusterCockpit",
MsgType: "alert-info",
Message: "Logout successful",
Build: buildInfo,
Infos: info,
})
}))).Methods(http.MethodPost)
secured.Use(func(next http.Handler) http.Handler {
return authentication.Auth(
// On success:
next,
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Authentication failed - ClusterCockpit",
MsgType: "alert-danger",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
})
})
}
if flagDev {
r.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
r.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
}
secured.Handle("/query", graphQLEndpoint)
// Receive a searchId, then either reply with a redirect to a user page or query the job table directly for jobid and project.
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
routerConfig.HandleSearchBar(rw, r, buildInfo)
})
// Mount all /monitoring/... and /api/... routes.
routerConfig.SetupRoutes(secured, buildInfo)
api.MountRoutes(secured)
if config.Keys.EmbedStaticFiles {
if i, err := os.Stat("./var/img"); err == nil {
if i.IsDir() {
log.Info("Use local directory for static images")
r.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
}
}
r.PathPrefix("/").Handler(web.ServeFiles())
} else {
r.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
}
r.Use(handlers.CompressHandler)
r.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
r.Use(handlers.CORS(
handlers.AllowCredentials(),
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"})))
handler := handlers.CustomLoggingHandler(io.Discard, r, func(_ io.Writer, params handlers.LogFormatterParams) {
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
} else {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
}
})
archiver.Start(repository.GetJobRepository())
taskManager.Start()
serverInit()
var wg sync.WaitGroup
server := http.Server{
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
Handler: handler,
Addr: config.Keys.Addr,
}
// Start http or https server
listener, err := net.Listen("tcp", config.Keys.Addr)
if err != nil {
log.Fatalf("starting http listener failed: %v", err)
}
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
go func() {
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
}()
}
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
cert, err := tls.LoadX509KeyPair(config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
if err != nil {
log.Fatalf("loading X509 keypair failed: %v", err)
}
listener = tls.NewListener(listener, &tls.Config{
Certificates: []tls.Certificate{cert},
CipherSuites: []uint16{
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
},
MinVersion: tls.VersionTLS12,
PreferServerCipherSuites: true,
})
fmt.Printf("HTTPS server listening at %s...", config.Keys.Addr)
} else {
fmt.Printf("HTTP server listening at %s...", config.Keys.Addr)
}
// Because this program will want to bind to a privileged port (like 80), the listener must
// be established first, then the user can be changed, and after that,
// the actual http server can be started.
if err = runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
log.Fatalf("error while preparing server start: %s", err.Error())
}
wg.Add(1)
go func() {
defer wg.Done()
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
log.Fatalf("starting server failed: %v", err)
}
serverStart()
}()
wg.Add(1)
@ -547,117 +235,15 @@ func main() {
<-sigs
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
// First shut down the server gracefully (waiting for all ongoing requests)
server.Shutdown(context.Background())
serverShutdown()
// Then, wait for any async archivings still pending...
api.JobRepository.WaitForArchiving()
taskManager.Shutdown()
}()
s := gocron.NewScheduler(time.Local)
if config.Keys.StopJobsExceedingWalltime > 0 {
log.Info("Register undead jobs service")
s.Every(1).Day().At("3:00").Do(func() {
err = jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
if err != nil {
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
}
runtime.GC()
})
}
var cfg struct {
Retention schema.Retention `json:"retention"`
Compression int `json:"compression"`
}
cfg.Retention.IncludeDB = true
if err = json.Unmarshal(config.Keys.Archive, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
}
switch cfg.Retention.Policy {
case "delete":
log.Info("Register retention delete service")
s.Every(1).Day().At("4:00").Do(func() {
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
jobs, err := jobRepo.FindJobsBetween(0, startTime)
if err != nil {
log.Warnf("Error while looking for retention jobs: %s", err.Error())
}
archive.GetHandle().CleanUp(jobs)
if cfg.Retention.IncludeDB {
cnt, err := jobRepo.DeleteJobsBefore(startTime)
if err != nil {
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
} else {
log.Infof("Retention: Removed %d jobs from db", cnt)
}
if err = jobRepo.Optimize(); err != nil {
log.Errorf("Error occured in db optimization: %s", err.Error())
}
}
})
case "move":
log.Info("Register retention move service")
s.Every(1).Day().At("4:00").Do(func() {
startTime := time.Now().Unix() - int64(cfg.Retention.Age*24*3600)
jobs, err := jobRepo.FindJobsBetween(0, startTime)
if err != nil {
log.Warnf("Error while looking for retention jobs: %s", err.Error())
}
archive.GetHandle().Move(jobs, cfg.Retention.Location)
if cfg.Retention.IncludeDB {
cnt, err := jobRepo.DeleteJobsBefore(startTime)
if err != nil {
log.Errorf("Error while deleting retention jobs from db: %v", err)
} else {
log.Infof("Retention: Removed %d jobs from db", cnt)
}
if err = jobRepo.Optimize(); err != nil {
log.Errorf("Error occured in db optimization: %v", err)
}
}
})
}
if cfg.Compression > 0 {
log.Info("Register compression service")
s.Every(1).Day().At("5:00").Do(func() {
var jobs []*schema.Job
ar := archive.GetHandle()
startTime := time.Now().Unix() - int64(cfg.Compression*24*3600)
lastTime := ar.CompressLast(startTime)
if startTime == lastTime {
log.Info("Compression Service - Complete archive run")
jobs, err = jobRepo.FindJobsBetween(0, startTime)
} else {
jobs, err = jobRepo.FindJobsBetween(lastTime, startTime)
}
if err != nil {
log.Warnf("Error while looking for compression jobs: %v", err)
}
ar.Compress(jobs)
})
}
s.StartAsync()
if os.Getenv("GOGC") == "" {
debug.SetGCPercent(25)
}
runtimeEnv.SystemdNotifiy(true, "running")
wg.Wait()
log.Print("Gracefull shutdown completed!")
log.Print("Graceful shutdown completed!")
}
cmd/cc-backend/server.go
@ -0,0 +1,330 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"os"
"strings"
"time"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/handler/transport"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
"github.com/ClusterCockpit/cc-backend/web"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
httpSwagger "github.com/swaggo/http-swagger"
)
var (
router *mux.Router
server *http.Server
apiHandle *api.RestApi
)
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusUnauthorized)
json.NewEncoder(rw).Encode(map[string]string{
"status": http.StatusText(http.StatusUnauthorized),
"error": err.Error(),
})
}
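For reference, an API request rejected by this handler receives a 401 status and a JSON body of the following shape (the error text below is a hypothetical example; the actual message depends on which authenticator failed):
{
  "status": "Unauthorized",
  "error": "no valid session or JWT found"
}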
func serverInit() {
// Setup the http.Handler/Router used by the server
graph.Init()
resolver := graph.GetResolverInstance()
graphQLServer := handler.New(
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
// graphQLServer.AddTransport(transport.SSE{})
graphQLServer.AddTransport(transport.POST{})
// graphQLServer.AddTransport(transport.Websocket{
// KeepAlivePingInterval: 10 * time.Second,
// Upgrader: websocket.Upgrader{
// CheckOrigin: func(r *http.Request) bool {
// return true
// },
// },
// })
if os.Getenv("DEBUG") != "1" {
// Having this handler means that an error message is returned via GraphQL instead of the connection simply being closed.
// The problem with this is that then no stacktrace is printed to stderr.
graphQLServer.SetRecoverFunc(func(ctx context.Context, err any) error {
switch e := err.(type) {
case string:
return fmt.Errorf("MAIN > Panic: %s", e)
case error:
return fmt.Errorf("MAIN > Panic caused by: %s", e.Error())
}
return errors.New("MAIN > Internal server error (panic)")
})
}
authHandle := auth.GetAuthInstance()
apiHandle = api.New()
router = mux.NewRouter()
buildInfo := web.Build{Version: version, Hash: commit, Buildtime: date}
info := map[string]any{}
info["hasOpenIDConnect"] = false
if config.Keys.OpenIDConfig != nil {
openIDConnect := auth.NewOIDC(authHandle)
openIDConnect.RegisterEndpoints(router)
info["hasOpenIDConnect"] = true
}
router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
log.Debugf("##%v##", info)
web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
}).Methods(http.MethodGet)
router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
})
router.HandleFunc("/privacy", func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
})
secured := router.PathPrefix("/").Subrouter()
securedapi := router.PathPrefix("/api").Subrouter()
userapi := router.PathPrefix("/userapi").Subrouter()
configapi := router.PathPrefix("/config").Subrouter()
frontendapi := router.PathPrefix("/frontend").Subrouter()
if !config.Keys.DisableAuthentication {
router.Handle("/login", authHandle.Login(
// On success: Handled within Login()
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
})).Methods(http.MethodPost)
router.Handle("/jwt-login", authHandle.Login(
// On success: Handled within Login()
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Login failed - ClusterCockpit",
MsgType: "alert-warning",
Message: err.Error(),
Build: buildInfo,
Infos: info,
})
}))
router.Handle("/logout", authHandle.Logout(
http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "text/html; charset=utf-8")
rw.WriteHeader(http.StatusOK)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Bye - ClusterCockpit",
MsgType: "alert-info",
Message: "Logout successful",
Build: buildInfo,
Infos: info,
})
}))).Methods(http.MethodPost)
secured.Use(func(next http.Handler) http.Handler {
return authHandle.Auth(
// On success:
next,
// On failure:
func(rw http.ResponseWriter, r *http.Request, err error) {
rw.WriteHeader(http.StatusUnauthorized)
web.RenderTemplate(rw, "login.tmpl", &web.Page{
Title: "Authentication failed - ClusterCockpit",
MsgType: "alert-danger",
Message: err.Error(),
Build: buildInfo,
Infos: info,
Redirect: r.RequestURI,
})
})
})
securedapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthApi(
// On success:
next,
// On failure: JSON Response
onFailureResponse)
})
userapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthUserApi(
// On success:
next,
// On failure: JSON Response
onFailureResponse)
})
configapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthConfigApi(
// On success:
next,
// On failure: JSON Response
onFailureResponse)
})
frontendapi.Use(func(next http.Handler) http.Handler {
return authHandle.AuthFrontendApi(
// On success:
next,
// On failure: JSON Response
onFailureResponse)
})
}
if flagDev {
router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
}
secured.Handle("/query", graphQLServer)
// Receive a searchId, then either reply with a redirect to a user page or query the job table directly for jobid and project.
secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
routerConfig.HandleSearchBar(rw, r, buildInfo)
})
// Mount all /monitoring/... and /api/... routes.
routerConfig.SetupRoutes(secured, buildInfo)
apiHandle.MountApiRoutes(securedapi)
apiHandle.MountUserApiRoutes(userapi)
apiHandle.MountConfigApiRoutes(configapi)
apiHandle.MountFrontendApiRoutes(frontendapi)
if config.Keys.EmbedStaticFiles {
if i, err := os.Stat("./var/img"); err == nil {
if i.IsDir() {
log.Info("Use local directory for static images")
router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
}
}
router.PathPrefix("/").Handler(web.ServeFiles())
} else {
router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
}
router.Use(handlers.CompressHandler)
router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
router.Use(handlers.CORS(
handlers.AllowCredentials(),
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"})))
}
func serverStart() {
handler := handlers.CustomLoggingHandler(io.Discard, router, func(_ io.Writer, params handlers.LogFormatterParams) {
if strings.HasPrefix(params.Request.RequestURI, "/api/") {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
} else {
log.Debugf("%s %s (%d, %.02fkb, %dms)",
params.Request.Method, params.URL.RequestURI(),
params.StatusCode, float32(params.Size)/1024,
time.Since(params.TimeStamp).Milliseconds())
}
})
server = &http.Server{
ReadTimeout: 20 * time.Second,
WriteTimeout: 20 * time.Second,
Handler: handler,
Addr: config.Keys.Addr,
}
// Start http or https server
listener, err := net.Listen("tcp", config.Keys.Addr)
if err != nil {
log.Abortf("Server Start: Starting http listener on '%s' failed.\nError: %s\n", config.Keys.Addr, err.Error())
}
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHttpTo != "" {
go func() {
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHttpTo, http.StatusMovedPermanently))
}()
}
if config.Keys.HttpsCertFile != "" && config.Keys.HttpsKeyFile != "" {
cert, err := tls.LoadX509KeyPair(
config.Keys.HttpsCertFile, config.Keys.HttpsKeyFile)
if err != nil {
log.Abortf("Server Start: Loading X509 keypair failed. Check options 'https-cert-file' and 'https-key-file' in 'config.json'.\nError: %s\n", err.Error())
}
listener = tls.NewListener(listener, &tls.Config{
Certificates: []tls.Certificate{cert},
CipherSuites: []uint16{
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
},
MinVersion: tls.VersionTLS12,
PreferServerCipherSuites: true,
})
log.Printf("HTTPS server listening at %s...\n", config.Keys.Addr)
} else {
log.Printf("HTTP server listening at %s...\n", config.Keys.Addr)
}
//
// Because this program will want to bind to a privileged port (like 80), the listener must
// be established first, then the user can be changed, and after that,
// the actual http server can be started.
if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
log.Abortf("Server Start: Error while preparing server start.\nError: %s\n", err.Error())
}
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
log.Abortf("Server Start: Starting server failed.\nError: %s\n", err.Error())
}
}
func serverShutdown() {
// First shut down the server gracefully (waiting for all ongoing requests)
server.Shutdown(context.Background())
// Then, wait for any async archivings still pending...
archiver.WaitForArchiving()
}
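Taken together, the three functions split out into server.go are driven from main() roughly as in the following minimal sketch, reconstructed from the main() diff above rather than quoted verbatim:
serverInit()                // build router, auth middleware, and GraphQL endpoint
var wg sync.WaitGroup
wg.Add(1)
go func() {
    defer wg.Done()
    serverStart()           // blocks in server.Serve(listener)
}()
// ... on SIGINT/SIGTERM:
serverShutdown()            // graceful HTTP shutdown, then wait for pending archivings
wg.Wait()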
@ -1,56 +1,70 @@
{
"addr": "127.0.0.1:8080",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2000h"
},
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2000h"
},
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"apiAllowedIPs": [
"*"
],
"emission-constant": 317,
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
]
}
@ -0,0 +1,69 @@
{
"addr": "127.0.0.1:8080",
"short-running-jobs-duration": 300,
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"jwts": {
"max-age": "2000h"
},
"db-driver": "mysql",
"db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit",
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"emission-constant": 317,
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": ""
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
]
}
@ -1,50 +1,62 @@
{
"addr": "0.0.0.0:443",
"ldap": {
"url": "ldaps://test",
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
"user_filter": "(&(objectclass=posixAccount))"
},
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"validate": true,
"clusters": [
{
"name": "test",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "eyJhbGciOiJF-E-pQBQ"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
"addr": "0.0.0.0:443",
"ldap": {
"url": "ldaps://test",
"user_base": "ou=people,ou=hpc,dc=test,dc=de",
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
"user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
"user_filter": "(&(objectclass=posixAccount))"
},
"https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"validate": false,
"apiAllowedIPs": [
"*"
],
"clusters": [
{
"name": "test",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "eyJhbGciOiJF-E-pQBQ"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
],
"jwts": {
"cookieName": "",
"validateUser": false,
"max-age": "2000h",
"trustedIssuer": ""
},
"short-running-jobs-duration": 300
}
}
],
"jwts": {
"cookieName": "",
"validateUser": false,
"max-age": "2000h",
"trustedIssuer": ""
},
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"short-running-jobs-duration": 300
}
@ -0,0 +1,12 @@
{
"clusters": [
{
"name": "fritz",
"default_metrics": "cpu_load, flops_any, core_power, lustre_open, mem_used, mem_bw, net_bytes_in"
},
{
"name": "alex",
"default_metrics": "flops_any, mem_bw, mem_used, vectorization_ratio"
}
]
}
@ -117,10 +117,12 @@ foreach my $ln (split("\n", $topo)) {
my $node;
my @sockets;
my @nodeCores;
foreach my $socket ( @{$DOMAINS{socket}} ) {
push @sockets, "[".join(",", @{$socket})."]";
$node .= join(",", @{$socket})
push @nodeCores, join(",", @{$socket});
}
$node = join(",", @nodeCores);
$INFO{sockets} = join(",\n", @sockets);
my @memDomains;
@ -212,9 +214,27 @@ print <<"END";
"socketsPerNode": $INFO{socketsPerNode},
"coresPerSocket": $INFO{coresPerSocket},
"threadsPerCore": $INFO{threadsPerCore},
"flopRateScalar": $flopsScalar,
"flopRateSimd": $flopsSimd,
"memoryBandwidth": $memBw,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": $flopsScalar
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": $flopsSimd
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": $memBw
},
"nodes": "<FILL IN NODE RANGES>",
"topology": {
"node": [$node],
go.mod
@ -1,91 +1,92 @@
module github.com/ClusterCockpit/cc-backend
go 1.18
go 1.23.5
toolchain go1.24.1
require (
github.com/99designs/gqlgen v0.17.45
github.com/99designs/gqlgen v0.17.66
github.com/ClusterCockpit/cc-units v0.4.0
github.com/Masterminds/squirrel v1.5.3
github.com/coreos/go-oidc/v3 v3.9.0
github.com/go-co-op/gocron v1.25.0
github.com/go-ldap/ldap/v3 v3.4.4
github.com/go-sql-driver/mysql v1.7.0
github.com/golang-jwt/jwt/v5 v5.2.1
github.com/golang-migrate/migrate/v4 v4.15.2
github.com/google/gops v0.3.27
github.com/gorilla/handlers v1.5.1
github.com/gorilla/mux v1.8.0
github.com/gorilla/sessions v1.2.1
github.com/influxdata/influxdb-client-go/v2 v2.12.2
github.com/jmoiron/sqlx v1.3.5
github.com/mattn/go-sqlite3 v1.14.16
github.com/prometheus/client_golang v1.14.0
github.com/prometheus/common v0.40.0
github.com/Masterminds/squirrel v1.5.4
github.com/coreos/go-oidc/v3 v3.12.0
github.com/go-co-op/gocron/v2 v2.16.0
github.com/go-ldap/ldap/v3 v3.4.10
github.com/go-sql-driver/mysql v1.9.0
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/golang-migrate/migrate/v4 v4.18.2
github.com/google/gops v0.3.28
github.com/gorilla/handlers v1.5.2
github.com/gorilla/mux v1.8.1
github.com/gorilla/sessions v1.4.0
github.com/influxdata/influxdb-client-go/v2 v2.14.0
github.com/jmoiron/sqlx v1.4.0
github.com/mattn/go-sqlite3 v1.14.24
github.com/prometheus/client_golang v1.21.0
github.com/prometheus/common v0.62.0
github.com/qustavo/sqlhooks/v2 v2.1.0
github.com/santhosh-tekuri/jsonschema/v5 v5.2.0
github.com/swaggo/http-swagger v1.3.3
github.com/swaggo/swag v1.16.3
github.com/vektah/gqlparser/v2 v2.5.11
golang.org/x/crypto v0.21.0
golang.org/x/exp v0.0.0-20230510235704-dd950f8aeaea
golang.org/x/oauth2 v0.13.0
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.4
github.com/vektah/gqlparser/v2 v2.5.22
golang.org/x/crypto v0.35.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/oauth2 v0.27.0
golang.org/x/time v0.5.0
)
require (
filippo.io/edwards25519 v1.1.0 // indirect
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.1.1 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/containerd/containerd v1.6.26 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/deepmap/oapi-codegen v1.12.4 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.7 // indirect
github.com/go-jose/go-jose/v4 v4.0.5 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/spec v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.1 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/joho/godotenv v1.5.1 // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sosodev/duration v1.2.0 // indirect
github.com/swaggo/files v1.0.0 // indirect
github.com/urfave/cli/v2 v2.27.1 // indirect
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
go.uber.org/atomic v1.10.0 // indirect
golang.org/x/mod v0.16.0 // indirect
golang.org/x/net v0.22.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/tools v0.19.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
google.golang.org/protobuf v1.33.0 // indirect
github.com/sosodev/duration v1.3.1 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.5 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.uber.org/atomic v1.11.0 // indirect
golang.org/x/mod v0.23.0 // indirect
golang.org/x/net v0.36.0 // indirect
golang.org/x/sync v0.11.0 // indirect
golang.org/x/sys v0.30.0 // indirect
golang.org/x/text v0.22.0 // indirect
golang.org/x/tools v0.30.0 // indirect
google.golang.org/protobuf v1.36.5 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
go.sum (diff suppressed because the file is too large)
@ -30,6 +30,7 @@ resolver:
# gqlgen will search for any type names in the schema in these go packages
# if they match it will use them, otherwise it will generate them.
autobind:
- "github.com/99designs/gqlgen/graphql/introspection"
- "github.com/ClusterCockpit/cc-backend/internal/graph/model"
# This section declares type mapping between the GraphQL and go type systems
@ -61,23 +62,50 @@ models:
fields:
partitions:
resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
NullableFloat:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
JobStatistics:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
GlobalMetricListItem:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.GlobalMetricListItem",
}
ClusterSupport:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.ClusterSupport" }
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
TimeRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
IntRange: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
JobMetric: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
Resource:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
JobState:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
TimeRange:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
IntRange:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.IntRange" }
JobMetric:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Series" }
MetricStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics" }
MetricConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
SubClusterConfig: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig" }
Accelerator: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
Topology: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
MetricStatistics:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricStatistics",
}
MetricConfig:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricConfig" }
SubClusterConfig:
{
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubClusterConfig",
}
Accelerator:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Accelerator" }
Topology:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Topology" }
FilterRanges:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
@ -1,5 +1,5 @@
[Unit]
Description=ClusterCockpit Web Server (Go edition)
Description=ClusterCockpit Web Server
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
@ -14,13 +14,16 @@ import (
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/api"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
@ -42,6 +45,9 @@ func setup(t *testing.T) *api.RestApi {
"jwts": {
"max-age": "2m"
},
"apiAllowedIPs": [
"*"
],
"clusters": [
{
"name": "testcluster",
@ -117,7 +123,7 @@ func setup(t *testing.T) *api.RestApi {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
t.Fatal(err)
}
@ -144,23 +150,20 @@ func setup(t *testing.T) *api.RestApi {
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
db := repository.GetConnection()
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
t.Fatal(err)
}
if err := metricdata.Init(config.Keys.DisableArchive); err != nil {
if err := metricdata.Init(); err != nil {
t.Fatal(err)
}
jobRepo := repository.GetJobRepository()
resolver := &graph.Resolver{DB: db.DB, Repo: jobRepo}
archiver.Start(repository.GetJobRepository())
auth.Init()
graph.Init()
return &api.RestApi{
JobRepository: resolver.Repo,
Resolver: resolver,
}
return api.New()
}
func cleanup() {
@ -175,7 +178,6 @@ func cleanup() {
func TestRestApi(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
@ -192,12 +194,18 @@ func TestRestApi(t *testing.T) {
},
}
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
restapi.MountRoutes(r)
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
restapi.MountApiRoutes(r)
var TestJobId int64 = 123
var TestClusterName string = "testcluster"
var TestStartTime int64 = 123456789
const startJobBody string = `{
"jobId": 123,
@ -213,7 +221,7 @@ func TestRestApi(t *testing.T) {
"exclusive": 1,
"monitoringStatus": 1,
"smt": 1,
"tags": [{ "type": "testTagType", "name": "testTagName" }],
"tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }],
"resources": [
{
"hostname": "host123",
@ -224,28 +232,33 @@ func TestRestApi(t *testing.T) {
"startTime": 123456789
}`
var dbid int64
const contextUserKey repository.ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
if ok := t.Run("StartJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
}
var res api.StartJobApiResponse
if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil {
t.Fatal(err)
}
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(res.DBID)))
resolver := graph.GetResolverInstance()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
job.Tags, err = restapi.Resolver.Job().Tags(context.Background(), job)
job.Tags, err = resolver.Job().Tags(ctx, job)
if err != nil {
t.Fatal(err)
}
@ -269,11 +282,9 @@ func TestRestApi(t *testing.T) {
t.Fatalf("unexpected job properties: %#v", job)
}
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" {
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
t.Fatalf("unexpected tags: %#v", job.Tags)
}
dbid = res.DBID
}); !ok {
return
}
@ -289,17 +300,19 @@ func TestRestApi(t *testing.T) {
var stoppedJob *schema.Job
if ok := t.Run("StopJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
restapi.JobRepository.WaitForArchiving()
job, err := restapi.Resolver.Query().Job(context.Background(), strconv.Itoa(int(dbid)))
archiver.WaitForArchiving()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@ -327,7 +340,7 @@ func TestRestApi(t *testing.T) {
}
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background())
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
@ -341,10 +354,12 @@ func TestRestApi(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far apart!
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1)
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(body)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusUnprocessableEntity {
t.Fatal(response.Status, recorder.Body.String())
@ -371,10 +386,12 @@ func TestRestApi(t *testing.T) {
}`
ok := t.Run("StartJobFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBodyFailed)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
@ -384,8 +401,10 @@ func TestRestApi(t *testing.T) {
t.Fatal("subtest failed")
}
time.Sleep(1 * time.Second)
const stopJobBodyFailed string = `{
"jobId": 12345,
"jobId": 12345,
"cluster": "testcluster",
"jobState": "failed",
@ -393,16 +412,18 @@ func TestRestApi(t *testing.T) {
}`
ok = t.Run("StopJobFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/api/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBodyFailed)))
recorder := httptest.NewRecorder()
r.ServeHTTP(recorder, req)
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
restapi.JobRepository.WaitForArchiving()
archiver.WaitForArchiving()
jobid, cluster := int64(12345), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {
Two further file diffs suppressed because they are too large.
@ -0,0 +1,94 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archiver
import (
"context"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
var (
archivePending sync.WaitGroup
archiveChannel chan *schema.Job
jobRepo *repository.JobRepository
)
func Start(r *repository.JobRepository) {
archiveChannel = make(chan *schema.Job, 128)
jobRepo = r
go archivingWorker()
}
// Archiving worker thread
func archivingWorker() {
for {
select {
case job, ok := <-archiveChannel:
if !ok {
break
}
start := time.Now()
// The metadata itself is not used here; the call loads JobMeta into the cache.
// It will fail if the job metadata is not in the repository.
if _, err := jobRepo.FetchMetadata(job); err != nil {
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
continue
}
// ArchiveJob fetches all data from a MetricDataRepository and pushes it into the configured archive backend
// TODO: Maybe use context with cancel/timeout here
jobMeta, err := ArchiveJob(job, context.Background())
if err != nil {
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
continue
}
stmt := sq.Update("job").Where("job.id = ?", job.ID)
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
log.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
continue
}
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
log.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
continue
}
// Update the jobs database entry one last time:
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
if err := jobRepo.Execute(stmt); err != nil {
log.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
continue
}
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
log.Printf("archiving job (dbid: %d) successful", job.ID)
archivePending.Done()
}
}
}
// Trigger async archiving
func TriggerArchiving(job *schema.Job) {
if archiveChannel == nil {
log.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
}
archivePending.Add(1)
archiveChannel <- job
}
// Wait for background thread to finish pending archiving operations
func WaitForArchiving() {
// wait for the worker to finish processing all pending jobs (the channel stays open)
archivePending.Wait()
}
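The intended lifecycle of this package, as wired up elsewhere in this commit, is a simple three-step pattern:
archiver.Start(repository.GetJobRepository()) // create the channel and spawn the worker
// ... whenever a job stops:
archiver.TriggerArchiving(job)                // non-blocking; tracked by the WaitGroup
// ... during shutdown:
archiver.WaitForArchiving()                   // block until all pending jobs are archived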
@ -0,0 +1,83 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archiver
import (
"context"
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
scopes := []schema.MetricScope{schema.MetricScopeNode}
// FIXME: Add a config option for this
if job.NumNodes <= 8 {
// This will add the native scope if core scope is not available
scopes = append(scopes, schema.MetricScopeCore)
}
if job.NumAcc > 0 {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // a resolution value of 0 retrieves the highest resolution (60s)
if err != nil {
log.Error("Error wile loading job data for archiving")
return nil, err
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// This should never happen ?
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
// Round AVG Result to 2 Digits
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
Min: min,
Max: max,
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if config.Keys.DisableArchive {
return jobMeta, nil
}
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
}
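As a quick illustration of the averaging and rounding above: for a hypothetical 3-node job whose node series report averages of 1.0, 2.0, and 4.0, the stored value is computed as
avg := 1.0 + 2.0 + 4.0                    // summed per-series averages: 7.0
val := math.Round((avg/3.0)*100) / 100    // 7.0/3 = 2.333... -> stored as 2.33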
@ -10,12 +10,19 @@ import (
"database/sql"
"encoding/base64"
"errors"
"fmt"
"net"
"net/http"
"os"
"strings"
"sync"
"time"
"golang.org/x/time/rate"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/sessions"
@ -26,6 +33,24 @@ type Authenticator interface {
Login(user *schema.User, rw http.ResponseWriter, r *http.Request) (*schema.User, error)
}
var (
initOnce sync.Once
authInstance *Authentication
)
var ipUserLimiters sync.Map
func getIPUserLimiter(ip, username string) *rate.Limiter {
key := ip + ":" + username
limiter, ok := ipUserLimiters.Load(key)
if !ok {
newLimiter := rate.NewLimiter(rate.Every(time.Hour/10), 10)
ipUserLimiters.Store(key, newLimiter)
return newLimiter
}
return limiter.(*rate.Limiter)
}
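With rate.Every(time.Hour/10) and a burst of 10, each ip:username pair gets at most ten attempts at once, then one new token every six minutes. A login handler would presumably gate on the limiter like this (a sketch; extracting ip and username from the request is assumed):
limiter := getIPUserLimiter(ip, username)
if !limiter.Allow() {
    http.Error(rw, "too many login attempts", http.StatusTooManyRequests)
    return
}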
type Authentication struct {
sessionStore *sessions.CookieStore
LdapAuth *LdapAuthenticator
@ -62,82 +87,111 @@ func (auth *Authentication) AuthViaSession(
}, nil
}
func Init() (*Authentication, error) {
auth := &Authentication{}
func Init() {
initOnce.Do(func() {
authInstance = &Authentication{}
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
log.Error("Error while initializing authentication -> failed to generate random bytes for session key")
return nil, err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
} else {
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
log.Error("Error while initializing authentication -> decoding session key failed")
return nil, err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
}
if config.Keys.LdapConfig != nil {
ldapAuth := &LdapAuthenticator{}
if err := ldapAuth.Init(); err != nil {
log.Warn("Error while initializing authentication -> ldapAuth init failed")
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
log.Fatal("Error while initializing authentication -> failed to generate random bytes for session key")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
} else {
auth.LdapAuth = ldapAuth
auth.authenticators = append(auth.authenticators, auth.LdapAuth)
}
} else {
log.Info("Missing LDAP configuration: No LDAP support!")
}
if config.Keys.JwtConfig != nil {
auth.JwtAuth = &JWTAuthenticator{}
if err := auth.JwtAuth.Init(); err != nil {
log.Error("Error while initializing authentication -> jwtAuth init failed")
return nil, err
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
log.Fatal("Error while initializing authentication -> decoding session key failed")
}
authInstance.sessionStore = sessions.NewCookieStore(bytes)
}
jwtSessionAuth := &JWTSessionAuthenticator{}
if err := jwtSessionAuth.Init(); err != nil {
log.Info("jwtSessionAuth init failed: No JWT login support!")
if d, err := time.ParseDuration(config.Keys.SessionMaxAge); err == nil {
authInstance.SessionMaxAge = d
}
if config.Keys.LdapConfig != nil {
ldapAuth := &LdapAuthenticator{}
if err := ldapAuth.Init(); err != nil {
log.Warn("Error while initializing authentication -> ldapAuth init failed")
} else {
authInstance.LdapAuth = ldapAuth
authInstance.authenticators = append(authInstance.authenticators, authInstance.LdapAuth)
}
} else {
auth.authenticators = append(auth.authenticators, jwtSessionAuth)
log.Info("Missing LDAP configuration: No LDAP support!")
}
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
if err := jwtCookieSessionAuth.Init(); err != nil {
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
if config.Keys.JwtConfig != nil {
authInstance.JwtAuth = &JWTAuthenticator{}
if err := authInstance.JwtAuth.Init(); err != nil {
log.Fatal("Error while initializing authentication -> jwtAuth init failed")
}
jwtSessionAuth := &JWTSessionAuthenticator{}
if err := jwtSessionAuth.Init(); err != nil {
log.Info("jwtSessionAuth init failed: No JWT login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtSessionAuth)
}
jwtCookieSessionAuth := &JWTCookieSessionAuthenticator{}
if err := jwtCookieSessionAuth.Init(); err != nil {
log.Info("jwtCookieSessionAuth init failed: No JWT cookie login support!")
} else {
authInstance.authenticators = append(authInstance.authenticators, jwtCookieSessionAuth)
}
} else {
auth.authenticators = append(auth.authenticators, jwtCookieSessionAuth)
log.Info("Missing JWT configuration: No JWT token support!")
}
} else {
log.Info("Missing JWT configuration: No JWT token support!")
}
auth.LocalAuth = &LocalAuthenticator{}
if err := auth.LocalAuth.Init(); err != nil {
log.Error("Error while initializing authentication -> localAuth init failed")
return nil, err
}
auth.authenticators = append(auth.authenticators, auth.LocalAuth)
return auth, nil
authInstance.LocalAuth = &LocalAuthenticator{}
if err := authInstance.LocalAuth.Init(); err != nil {
log.Fatal("Error while initializing authentication -> localAuth init failed")
}
authInstance.authenticators = append(authInstance.authenticators, authInstance.LocalAuth)
})
}
func persistUser(user *schema.User) {
func GetAuthInstance() *Authentication {
if authInstance == nil {
log.Fatal("Authentication module not initialized!")
}
return authInstance
}
func handleTokenUser(tokenUser *schema.User) {
r := repository.GetUserRepository()
_, err := r.GetUser(user.Username)
dbUser, err := r.GetUser(tokenUser.Username)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%s': %v", user.Username, err)
} else if err == sql.ErrNoRows {
if err := r.AddUser(user); err != nil {
log.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
log.Errorf("Error while loading user '%s': %v", tokenUser.Username, err)
} else if err == sql.ErrNoRows && config.Keys.JwtConfig.SyncUserOnLogin { // Adds New User
if err := r.AddUser(tokenUser); err != nil {
log.Errorf("Error while adding user '%s' to DB: %v", tokenUser.Username, err)
}
} else if err == nil && config.Keys.JwtConfig.UpdateUserOnLogin { // Update Existing User
if err := r.UpdateUser(dbUser, tokenUser); err != nil {
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
}
}
}
func handleOIDCUser(OIDCUser *schema.User) {
r := repository.GetUserRepository()
dbUser, err := r.GetUser(OIDCUser.Username)
if err != nil && err != sql.ErrNoRows {
log.Errorf("Error while loading user '%s': %v", OIDCUser.Username, err)
} else if err == sql.ErrNoRows && config.Keys.OpenIDConfig.SyncUserOnLogin { // Adds New User
if err := r.AddUser(OIDCUser); err != nil {
log.Errorf("Error while adding user '%s' to DB: %v", OIDCUser.Username, err)
}
} else if err == nil && config.Keys.OpenIDConfig.UpdateUserOnLogin { // Update Existing User
if err := r.UpdateUser(dbUser, OIDCUser); err != nil {
log.Errorf("Error while updating user '%s' to DB: %v", dbUser.Username, err)
}
}
}
@ -153,6 +207,10 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
if config.Keys.HttpsCertFile == "" && config.Keys.HttpsKeyFile == "" {
session.Options.Secure = false
}
session.Options.SameSite = http.SameSiteStrictMode
session.Values["username"] = user.Username
session.Values["projects"] = user.Projects
session.Values["roles"] = user.Roles
@ -166,13 +224,24 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
}
func (auth *Authentication) Login(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
username := r.FormValue("username")
var dbUser *schema.User
ip, _, err := net.SplitHostPort(r.RemoteAddr)
if err != nil {
ip = r.RemoteAddr
}
username := r.FormValue("username")
limiter := getIPUserLimiter(ip, username)
if !limiter.Allow() {
log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes."))
return
}
var dbUser *schema.User
if username != "" {
var err error
dbUser, err = repository.GetUserRepository().GetUser(username)
@ -203,7 +272,13 @@ func (auth *Authentication) Login(
log.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
if r.FormValue("redirect") != "" {
http.RedirectHandler(r.FormValue("redirect"), http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
return
}
http.RedirectHandler("/", http.StatusFound).ServeHTTP(rw, r.WithContext(ctx))
return
}
@ -219,31 +294,150 @@ func (auth *Authentication) Auth(
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("authentication failed: %s", err.Error())
log.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
if user == nil {
user, err = auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("authentication failed: %s", err.Error())
log.Infof("auth -> authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
}
if user != nil {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Debug("authentication failed")
log.Info("auth -> authentication failed")
onfailure(rw, r, errors.New("unauthorized (please login first)"))
})
}
func (auth *Authentication) AuthApi(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("auth api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
ipErr := securedCheck(user, r)
if ipErr != nil {
log.Infof("auth api -> secured check failed: %s", ipErr.Error())
onfailure(rw, r, ipErr)
return
}
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
default:
log.Info("auth api -> authentication failed: missing role")
onfailure(rw, r, errors.New("unauthorized"))
}
}
log.Info("auth api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
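The role matrix AuthApi enforces is easy to misread in the switch; a small self-contained sketch with plain strings (hypothetical names, not the schema package) states it directly:

package main

import "fmt"

// allowedForApi mirrors the switch above: exactly one role must be "api";
// with two or more roles, both "admin" and "api" must be present.
func allowedForApi(roles []string) bool {
    has := func(want string) bool {
        for _, r := range roles {
            if r == want {
                return true
            }
        }
        return false
    }
    switch {
    case len(roles) == 1:
        return has("api")
    case len(roles) >= 2:
        return has("admin") && has("api")
    default:
        return false
    }
}

func main() {
    fmt.Println(allowedForApi([]string{"api"}))          // true
    fmt.Println(allowedForApi([]string{"user", "api"}))  // false
    fmt.Println(allowedForApi([]string{"admin", "api"})) // true
}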
func (auth *Authentication) AuthUserApi(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.JwtAuth.AuthViaJWT(rw, r)
if err != nil {
log.Infof("auth user api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
default:
log.Info("auth user api -> authentication failed: missing role")
onfailure(rw, r, errors.New("unauthorized"))
}
}
log.Info("auth user api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthConfigApi(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("auth config api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
if user != nil && user.HasRole(schema.RoleAdmin) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Info("auth config api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) AuthFrontendApi(
onsuccess http.Handler,
onfailure func(rw http.ResponseWriter, r *http.Request, authErr error),
) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.AuthViaSession(rw, r)
if err != nil {
log.Infof("auth frontend api -> authentication failed: %s", err.Error())
onfailure(rw, r, err)
return
}
if user != nil {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
log.Info("auth frontend api -> authentication failed: no auth")
onfailure(rw, r, errors.New("unauthorized"))
})
}
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
session, err := auth.sessionStore.Get(r, "session")
@ -263,3 +457,38 @@ func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
onsuccess.ServeHTTP(rw, r)
})
}
// Helper moved to middleware auth handlers
func securedCheck(user *schema.User, r *http.Request) error {
if user == nil {
return fmt.Errorf("no user for secured check")
}
// extract IP address for checking
IPAddress := r.Header.Get("X-Real-Ip")
if IPAddress == "" {
IPAddress = r.Header.Get("X-Forwarded-For")
}
if IPAddress == "" {
IPAddress = r.RemoteAddr
}
if strings.Contains(IPAddress, ":") {
IPAddress = strings.Split(IPAddress, ":")[0]
}
// If nothing declared in config: deny all request to this api endpoint
if len(config.Keys.ApiAllowedIPs) == 0 {
return fmt.Errorf("missing configuration key ApiAllowedIPs")
}
// If wildcard declared in config: Continue
if config.Keys.ApiAllowedIPs[0] == "*" {
return nil
}
// check if IP is allowed
if !util.Contains(config.Keys.ApiAllowedIPs, IPAddress) {
return fmt.Errorf("unknown ip: %v", IPAddress)
}
return nil
}
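One caveat in the extraction above: splitting on the first ":" truncates IPv6 addresses ("::1" becomes an empty string). A hedged variant using net.SplitHostPort, which only strips a port when one is actually present, could look like this (net is already imported in this file):

// clientIP resolves the caller address the same way securedCheck does, but
// is safe for IPv6 literals. Note X-Forwarded-For may carry a comma-separated
// chain; this sketch keeps the header value as-is.
func clientIP(r *http.Request) string {
    addr := r.Header.Get("X-Real-Ip")
    if addr == "" {
        addr = r.Header.Get("X-Forwarded-For")
    }
    if addr == "" {
        addr = r.RemoteAddr
    }
    if host, _, err := net.SplitHostPort(addr); err == nil {
        addr = host
    }
    return addr
}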

View File

@ -198,8 +198,8 @@ func (ja *JWTCookieSessionAuthenticator) Login(
AuthSource: schema.AuthViaToken,
}
if jc.SyncUserOnLogin {
persistUser(user)
if jc.SyncUserOnLogin || jc.UpdateUserOnLogin {
handleTokenUser(user)
}
}

View File

@ -138,8 +138,8 @@ func (ja *JWTSessionAuthenticator) Login(
AuthSource: schema.AuthViaToken,
}
if config.Keys.JwtConfig.SyncUserOnLogin {
persistUser(user)
if config.Keys.JwtConfig.SyncUserOnLogin || config.Keys.JwtConfig.UpdateUserOnLogin {
handleTokenUser(user)
}
}

View File

@ -10,7 +10,6 @@ import (
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
@ -34,33 +33,6 @@ func (la *LdapAuthenticator) Init() error {
lc := config.Keys.LdapConfig
if lc.SyncInterval != "" {
interval, err := time.ParseDuration(lc.SyncInterval)
if err != nil {
log.Warnf("Could not parse duration for sync interval: %v",
lc.SyncInterval)
return err
}
if interval == 0 {
log.Info("Sync interval is zero")
return nil
}
go func() {
ticker := time.NewTicker(interval)
for t := range ticker.C {
log.Printf("sync started at %s", t.Format(time.RFC3339))
if err := la.Sync(); err != nil {
log.Errorf("sync failed: %s", err.Error())
}
log.Print("sync done")
}
}()
} else {
log.Info("LDAP configuration key sync_interval invalid")
}
if lc.UserAttr != "" {
la.UserAttr = lc.UserAttr
} else {

View File

@ -168,8 +168,8 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
AuthSource: schema.AuthViaOIDC,
}
if config.Keys.OpenIDConfig.SyncUserOnLogin {
persistUser(user)
if config.Keys.OpenIDConfig.SyncUserOnLogin || config.Keys.OpenIDConfig.UpdateUserOnLogin {
handleOIDCUser(user)
}
oa.authentication.SaveSession(rw, r, user)

View File

@ -7,9 +7,9 @@ package config
import (
"bytes"
"encoding/json"
"log"
"os"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
@ -29,10 +29,9 @@ var Keys schema.ProgramConfig = schema.ProgramConfig{
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_showFootprint": true,
"job_list_usePaging": true,
"job_list_usePaging": false,
"plot_general_colorBackground": true,
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
"plot_general_lineWidth": 3,
@ -54,20 +53,20 @@ func Init(flagConfigFile string) {
raw, err := os.ReadFile(flagConfigFile)
if err != nil {
if !os.IsNotExist(err) {
log.Fatalf("CONFIG ERROR: %v", err)
log.Abortf("Config Init: Could not read config file '%s'.\nError: %s\n", flagConfigFile, err.Error())
}
} else {
if err := schema.Validate(schema.Config, bytes.NewReader(raw)); err != nil {
log.Fatalf("Validate config: %v\n", err)
log.Abortf("Config Init: Could not validate config file '%s'.\nError: %s\n", flagConfigFile, err.Error())
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
log.Fatalf("could not decode: %v", err)
log.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", flagConfigFile, err.Error())
}
if Keys.Clusters == nil || len(Keys.Clusters) < 1 {
log.Fatal("At least one cluster required in config!")
log.Abort("Config Init: At least one cluster required in config. Exited with error.")
}
}
}

View File

@ -0,0 +1,44 @@
package config
import (
"encoding/json"
"os"
"strings"
)
type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
}
type DefaultMetricsConfig struct {
Clusters []DefaultMetricsCluster `json:"clusters"`
}
func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) {
filePath := "default_metrics.json"
if _, err := os.Stat(filePath); os.IsNotExist(err) {
return nil, nil
}
data, err := os.ReadFile(filePath)
if err != nil {
return nil, err
}
var cfg DefaultMetricsConfig
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, err
}
return &cfg, nil
}
func ParseMetricsString(s string) []string {
parts := strings.Split(s, ",")
var metrics []string
for _, p := range parts {
trimmed := strings.TrimSpace(p)
if trimmed != "" {
metrics = append(metrics, trimmed)
}
}
return metrics
}
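A hypothetical usage of the two helpers above, resolving the default metric list for one cluster (the JSON shape and the cluster name are illustrative):

// Example default_metrics.json:
//   {"clusters": [{"name": "fritz", "default_metrics": "flops_any, mem_bw"}]}
//
// defaultMetricsFor returns nil when the file is absent, unreadable, or the
// cluster is not listed.
func defaultMetricsFor(cluster string) []string {
    cfg, err := LoadDefaultMetricsConfig()
    if err != nil || cfg == nil {
        return nil
    }
    for _, c := range cfg.Clusters {
        if c.Name == cluster {
            return ParseMetricsString(c.DefaultMetrics)
        }
    }
    return nil
}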

File diff suppressed because it is too large

View File

@ -16,11 +16,23 @@ type Count struct {
Count int `json:"count"`
}
type EnergyFootprintValue struct {
Hardware string `json:"hardware"`
Metric string `json:"metric"`
Value float64 `json:"value"`
}
type FloatRange struct {
From float64 `json:"from"`
To float64 `json:"to"`
}
type FootprintValue struct {
Name string `json:"name"`
Stat string `json:"stat"`
Value float64 `json:"value"`
}
type Footprints struct {
TimeWeights *TimeWeights `json:"timeWeights"`
Metrics []*MetricFootprints `json:"metrics"`
@ -38,6 +50,7 @@ type IntRangeOutput struct {
type JobFilter struct {
Tags []string `json:"tags,omitempty"`
DbID []string `json:"dbId,omitempty"`
JobID *StringInput `json:"jobId,omitempty"`
ArrayJobID *int `json:"arrayJobId,omitempty"`
User *StringInput `json:"user,omitempty"`
@ -46,16 +59,14 @@ type JobFilter struct {
Cluster *StringInput `json:"cluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *schema.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
MinRunningFor *int `json:"minRunningFor,omitempty"`
NumNodes *schema.IntRange `json:"numNodes,omitempty"`
NumAccelerators *schema.IntRange `json:"numAccelerators,omitempty"`
NumHWThreads *schema.IntRange `json:"numHWThreads,omitempty"`
StartTime *schema.TimeRange `json:"startTime,omitempty"`
State []schema.JobState `json:"state,omitempty"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg,omitempty"`
MemBwAvg *FloatRange `json:"memBwAvg,omitempty"`
LoadAvg *FloatRange `json:"loadAvg,omitempty"`
MemUsedMax *FloatRange `json:"memUsedMax,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Exclusive *int `json:"exclusive,omitempty"`
Node *StringInput `json:"node,omitempty"`
}
@ -85,6 +96,19 @@ type JobResultList struct {
HasNextPage *bool `json:"hasNextPage,omitempty"`
}
type JobStats struct {
ID int `json:"id"`
JobID string `json:"jobId"`
StartTime int `json:"startTime"`
Duration int `json:"duration"`
Cluster string `json:"cluster"`
SubCluster string `json:"subCluster"`
NumNodes int `json:"numNodes"`
NumHWThreads *int `json:"numHWThreads,omitempty"`
NumAccelerators *int `json:"numAccelerators,omitempty"`
Stats []*NamedStats `json:"stats"`
}
type JobsStatistics struct {
ID string `json:"id"`
Name string `json:"name"`
@ -120,20 +144,47 @@ type MetricHistoPoint struct {
type MetricHistoPoints struct {
Metric string `json:"metric"`
Unit string `json:"unit"`
Stat *string `json:"stat,omitempty"`
Data []*MetricHistoPoint `json:"data,omitempty"`
}
type MetricStatItem struct {
MetricName string `json:"metricName"`
Range *FloatRange `json:"range"`
}
type Mutation struct {
}
type NamedStats struct {
Name string `json:"name"`
Data *schema.MetricStatistics `json:"data"`
}
type NamedStatsWithScope struct {
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
Stats []*ScopedStats `json:"stats"`
}
type NodeMetrics struct {
Host string `json:"host"`
SubCluster string `json:"subCluster"`
Metrics []*JobMetricWithName `json:"metrics"`
}
type NodesResultList struct {
Items []*NodeMetrics `json:"items"`
Offset *int `json:"offset,omitempty"`
Limit *int `json:"limit,omitempty"`
Count *int `json:"count,omitempty"`
TotalNodes *int `json:"totalNodes,omitempty"`
HasNextPage *bool `json:"hasNextPage,omitempty"`
}
type OrderByInput struct {
Field string `json:"field"`
Type string `json:"type"`
Order SortDirectionEnum `json:"order"`
}
@ -142,7 +193,10 @@ type PageRequest struct {
Page int `json:"page"`
}
type Query struct {
type ScopedStats struct {
Hostname string `json:"hostname"`
ID *string `json:"id,omitempty"`
Data *schema.MetricStatistics `json:"data"`
}
type StringInput struct {
@ -155,8 +209,9 @@ type StringInput struct {
}
type TimeRangeOutput struct {
From time.Time `json:"from"`
To time.Time `json:"to"`
Range *string `json:"range,omitempty"`
From time.Time `json:"from"`
To time.Time `json:"to"`
}
type TimeWeights struct {
@ -197,7 +252,7 @@ func (e Aggregate) String() string {
return string(e)
}
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
func (e *Aggregate) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
@ -250,7 +305,7 @@ func (e SortByAggregate) String() string {
return string(e)
}
func (e *SortByAggregate) UnmarshalGQL(v interface{}) error {
func (e *SortByAggregate) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
@ -291,7 +346,7 @@ func (e SortDirectionEnum) String() string {
return string(e)
}
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
func (e *SortDirectionEnum) UnmarshalGQL(v any) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")

View File

@ -1,15 +1,39 @@
package graph
import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/jmoiron/sqlx"
)
// This file will not be regenerated automatically.
//
// It serves as dependency injection for your app, add any dependencies you require here.
var (
initOnce sync.Once
resolverInstance *Resolver
)
type Resolver struct {
DB *sqlx.DB
Repo *repository.JobRepository
}
func Init() {
initOnce.Do(func() {
db := repository.GetConnection()
resolverInstance = &Resolver{
DB: db.DB, Repo: repository.GetJobRepository(),
}
})
}
func GetResolverInstance() *Resolver {
if resolverInstance == nil {
log.Fatal("Authentication module not initialized!")
}
return resolverInstance
}

View File

@ -2,19 +2,22 @@ package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.45
// Code generated by github.com/99designs/gqlgen version v0.17.66
import (
"context"
"errors"
"fmt"
"regexp"
"slices"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
@ -28,15 +31,12 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
// Tags is the resolver for the tags field.
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
return r.Repo.GetTags(&obj.ID)
return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID)
}
// ConcurrentJobs is the resolver for the concurrentJobs field.
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
if obj.State == schema.JobStateRunning {
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
}
// FIXME: Make the hardcoded duration configurable
if obj.Exclusive != 1 && obj.Duration > 600 {
return r.Repo.FindConcurrentJobs(ctx, obj)
}
@ -44,8 +44,72 @@ func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*mod
return nil, nil
}
// Footprint is the resolver for the footprint field.
func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*model.FootprintValue, error) {
rawFootprint, err := r.Repo.FetchFootprint(obj)
if err != nil {
log.Warn("Error while fetching job footprint data")
return nil, err
}
res := []*model.FootprintValue{}
for name, value := range rawFootprint {
parts := strings.Split(name, "_")
statPart := parts[len(parts)-1]
nameParts := parts[:len(parts)-1]
res = append(res, &model.FootprintValue{
Name: strings.Join(nameParts, "_"),
Stat: statPart,
Value: value,
})
}
return res, err
}
// EnergyFootprint is the resolver for the energyFootprint field.
func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*model.EnergyFootprintValue, error) {
rawEnergyFootprint, err := r.Repo.FetchEnergyFootprint(obj)
if err != nil {
log.Warn("Error while fetching job energy footprint data")
return nil, err
}
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)
hwType := ""
switch test := name; { // Notice ';' for variable declaration
case matchCpu.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
case matchMem.MatchString(test):
hwType = "Memory"
case matchCore.MatchString(test):
hwType = "Core"
default:
hwType = "Other"
}
res = append(res, &model.EnergyFootprintValue{
Hardware: hwType,
Metric: name,
Value: value,
})
}
return res, err
}
// MetaData is the resolver for the metaData field.
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (any, error) {
return r.Repo.FetchMetadata(obj)
}
@ -54,24 +118,48 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use
return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User)
}
// Name is the resolver for the name field.
func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) {
panic(fmt.Errorf("not implemented: Name - name"))
}
// CreateTag is the resolver for the createTag field.
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
id, err := r.Repo.CreateTag(typeArg, name)
if err != nil {
log.Warn("Error while creating tag")
return nil, err
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string, scope string) (*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && scope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && scope == "global" ||
user.Username == scope {
// Create in DB
id, err := r.Repo.CreateTag(typeArg, name, scope)
if err != nil {
log.Warn("Error while creating tag")
return nil, err
}
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
} else {
log.Warnf("Not authorized to create tag with scope: %s", scope)
return nil, fmt.Errorf("Not authorized to create tag with scope: %s", scope)
}
}
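The same scope test recurs below for adding and removing tags on jobs; a helper consolidating it could read as follows (the function name is hypothetical, not part of this commit):

// tagScopeAllowed: admins may manage "admin" tags, support staff and admins
// may manage "global" tags, and every user may manage tags scoped to their
// own username.
func tagScopeAllowed(user *schema.User, scope string) bool {
    switch scope {
    case "admin":
        return user.HasRole(schema.RoleAdmin)
    case "global":
        return user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport})
    default:
        return user.Username == scope
    }
}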
// DeleteTag is the resolver for the deleteTag field.
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
// This Uses ID string <-> ID string, removeTagFromList uses []string <-> []int
panic(fmt.Errorf("not implemented: DeleteTag - deleteTag"))
}
// AddTagsToJob is the resolver for the addTagsToJob field.
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while adding tag to job")
@ -80,15 +168,32 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
tags := []*schema.Tag{}
for _, tagId := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
log.Warn("Error while adding tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Add to Job
if tags, err = r.Repo.AddTag(user, jid, tid); err != nil {
log.Warn("Error while adding tag")
return nil, err
}
} else {
log.Warnf("Not authorized to add tag: %d", tid)
return nil, fmt.Errorf("Not authorized to add tag: %d", tid)
}
}
@ -97,6 +202,11 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
// RemoveTagsFromJob is the resolver for the removeTagsFromJob field.
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
log.Warn("Error while parsing job id")
@ -105,21 +215,80 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
tags := []*schema.Tag{}
for _, tagId := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id")
return nil, err
}
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
log.Warn("Error while removing tag")
return nil, err
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && tscope == "admin" ||
user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) && tscope == "global" ||
user.Username == tscope {
// Remove from Job
if tags, err = r.Repo.RemoveTag(user, jid, tid); err != nil {
log.Warn("Error while removing tag")
return nil, err
}
} else {
log.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("Not authorized to remove tag: %d", tid)
}
}
return tags, nil
}
// RemoveTagFromList is the resolver for the removeTagFromList field.
func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) {
// Needs context user
user := repository.GetUserFromContext(ctx)
if user == nil {
return nil, fmt.Errorf("no user in context")
}
tags := []int{}
for _, tagId := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
if err != nil {
log.Warn("Error while parsing tag id for removal")
return nil, err
}
// Test Exists
_, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists {
log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid)
}
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagById(tid); err != nil {
log.Warn("Error while removing tag")
return nil, err
} else {
tags = append(tags, int(tid))
}
} else {
log.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("Not authorized to remove tag: %d", tid)
}
}
return tags, nil
}
// UpdateConfiguration is the resolver for the updateConfiguration field.
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
if err := repository.GetUserCfgRepo().UpdateConfig(name, value, repository.GetUserFromContext(ctx)); err != nil {
@ -137,7 +306,12 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error)
// Tags is the resolver for the tags field.
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
return r.Repo.GetTags(nil)
return r.Repo.GetTags(repository.GetUserFromContext(ctx), nil)
}
// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
return archive.GlobalMetricList, nil
}
// User is the resolver for the user field.
@ -172,7 +346,7 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
return nil, err
}
job, err := r.Repo.FindById(numericId)
job, err := r.Repo.FindById(ctx, numericId)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
@ -188,14 +362,24 @@ func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error)
}
// JobMetrics is the resolver for the jobMetrics field.
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) {
if resolution == nil { // Load from Config
if config.Keys.EnableResampling != nil {
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
resolution = &defaultRes
} else { // Set 0 (Loads configured metric timestep)
defaultRes := 0
resolution = &defaultRes
}
}
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warn("Error while querying job for metrics")
return nil, err
}
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
log.Warn("Error while loading job data")
return nil, err
@ -215,9 +399,67 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return res, err
}
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
return r.jobsFootprints(ctx, filter, metrics)
// JobStats is the resolver for the jobStats field.
func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warnf("Error while querying job %s for metadata", id)
return nil, err
}
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
if err != nil {
log.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
}
res := []*model.NamedStats{}
for name, md := range data {
res = append(res, &model.NamedStats{
Name: name,
Data: &md,
})
}
return res, err
}
// ScopedJobStats is the resolver for the scopedJobStats field.
func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.NamedStatsWithScope, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
log.Warnf("Error while querying job %s for metadata", id)
return nil, err
}
data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
log.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
}
res := make([]*model.NamedStatsWithScope, 0)
for name, scoped := range data {
for scope, stats := range scoped {
mdlStats := make([]*model.ScopedStats, 0)
for _, stat := range stats {
mdlStats = append(mdlStats, &model.ScopedStats{
Hostname: stat.Hostname,
ID: stat.Id,
Data: stat.Data,
})
}
res = append(res, &model.NamedStatsWithScope{
Name: name,
Scope: scope,
Stats: mdlStats,
})
}
}
return res, nil
}
// Jobs is the resolver for the jobs field.
@ -241,30 +483,39 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
return nil, err
}
if !config.Keys.UiDefaults["job_list_usePaging"].(bool) {
hasNextPage := false
page.Page += 1
nextJobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
if err != nil {
log.Warn("Error while querying next jobs")
return nil, err
}
if len(nextJobs) > 0 {
hasNextPage = true
}
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
} else {
return &model.JobResultList{Items: jobs, Count: &count}, nil
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
log.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage := false
if len(nextJobs) == 1 {
hasNextPage = true
}
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
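To spell out the probe arithmetic: on page 4 with 10 items per page the client has seen items 1..40, so requesting a one-item page starting at item 41 answers whether page 5 exists. As a sketch:

// probePage computes the one-item probe page used above; probePage(4, 10)
// returns 41, so a single result there proves page 5 @ 10 IpP exists.
func probePage(page, itemsPerPage int) int {
    return page*itemsPerPage + 1
}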
// JobsStatistics is the resolver for the jobsStatistics field.
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) ([]*model.JobsStatistics, error) {
var err error
var stats []*model.JobsStatistics
// Top Level Defaults
var defaultDurationBins string = "1h"
var defaultMetricBins int = 10
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
if groupBy == nil {
@ -298,8 +549,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
}
if requireField(ctx, "histDuration") || requireField(ctx, "histNumNodes") || requireField(ctx, "histNumCores") || requireField(ctx, "histNumAccs") {
if numDurationBins == nil {
numDurationBins = &defaultDurationBins
}
if groupBy == nil {
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0])
stats[0], err = r.Repo.AddHistograms(ctx, filter, stats[0], numDurationBins)
if err != nil {
return nil, err
}
@ -309,8 +565,13 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
}
if requireField(ctx, "histMetrics") {
if numMetricBins == nil {
numMetricBins = &defaultMetricBins
}
if groupBy == nil {
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0])
stats[0], err = r.Repo.AddMetricHistograms(ctx, filter, metrics, stats[0], numMetricBins)
if err != nil {
return nil, err
}
@ -322,6 +583,62 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
return stats, nil
}
// JobsMetricStats is the resolver for the jobsMetricStats field.
func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.JobStats, error) {
// No Paging, Fixed Order by StartTime ASC
order := &model.OrderByInput{
Field: "startTime",
Type: "col",
Order: "ASC",
}
jobs, err := r.Repo.QueryJobs(ctx, filter, nil, order)
if err != nil {
log.Warn("Error while querying jobs for comparison")
return nil, err
}
res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
if err != nil {
log.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
// return nil, err
}
sres := []*model.NamedStats{}
for name, md := range data {
sres = append(sres, &model.NamedStats{
Name: name,
Data: &md,
})
}
numThreadsInt := int(job.NumHWThreads)
numAccsInt := int(job.NumAcc)
res = append(res, &model.JobStats{
ID: int(job.ID),
JobID: strconv.Itoa(int(job.JobID)),
StartTime: int(job.StartTime.Unix()),
Duration: int(job.Duration),
Cluster: job.Cluster,
SubCluster: job.SubCluster,
NumNodes: int(job.NumNodes),
NumHWThreads: &numThreadsInt,
NumAccelerators: &numAccsInt,
Stats: sres,
})
}
return res, err
}
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
return r.jobsFootprints(ctx, filter, metrics)
}
// RooflineHeatmap is the resolver for the rooflineHeatmap field.
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
@ -330,8 +647,8 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
// NodeMetrics is the resolver for the nodeMetrics field.
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasRole(schema.RoleAdmin) {
return nil, errors.New("you need to be an administrator for this query")
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
@ -340,9 +657,9 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
}
}
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
log.Warn("Error while loading node data")
log.Warn("error while loading node data")
return nil, err
}
@ -352,7 +669,10 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
Host: hostname,
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
}
host.SubCluster, _ = archive.GetSubClusterByNode(cluster, hostname)
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
log.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
@ -370,6 +690,68 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nodeMetrics, nil
}
// NodeMetricsList is the resolver for the nodeMetricsList field.
func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) {
if resolution == nil { // Load from Config
if config.Keys.EnableResampling != nil {
defaultRes := slices.Max(config.Keys.EnableResampling.Resolutions)
resolution = &defaultRes
} else { // Set 0 (Loads configured metric timestep)
defaultRes := 0
resolution = &defaultRes
}
}
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
data, totalNodes, hasNextPage, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, *resolution, from, to, page, ctx)
if err != nil {
log.Warn("error while loading node data")
return nil, err
}
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
host := &model.NodeMetrics{
Host: hostname,
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
log.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for scope, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
Scope: scope,
Metric: scopedMetric,
})
}
}
nodeMetricsList = append(nodeMetricsList, host)
}
nodeMetricsListResult := &model.NodesResultList{
Items: nodeMetricsList,
TotalNodes: &totalNodes,
HasNextPage: &hasNextPage,
}
return nodeMetricsListResult, nil
}
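The nil-resolution defaulting at the top of this resolver is duplicated verbatim in JobMetrics above; a shared helper could replace both copies (the name is hypothetical; config and slices are already imported here):

// defaultResolution picks the coarsest configured resampling resolution,
// or 0 to fall back to the metric's configured timestep.
func defaultResolution() int {
    if config.Keys.EnableResampling != nil {
        return slices.Max(config.Keys.EnableResampling.Resolutions)
    }
    return 0
}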
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)
@ -385,6 +767,9 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver
// Job returns generated.JobResolver implementation.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
// MetricValue returns generated.MetricValueResolver implementation.
func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} }
// Mutation returns generated.MutationResolver implementation.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
@ -396,6 +781,7 @@ func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subCluste
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type metricValueResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }

View File

@ -11,7 +11,7 @@ import (
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
@ -24,8 +24,8 @@ func (r *queryResolver) rooflineHeatmap(
ctx context.Context,
filter []*model.JobFilter,
rows int, cols int,
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
minX float64, minY float64, maxX float64, maxY float64,
) ([][]float64, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
log.Error("Error while querying jobs for roofline")
@ -47,7 +47,14 @@ func (r *queryResolver) rooflineHeatmap(
continue
}
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
// resolution := 0
// for _, mc := range metricConfigs {
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
return nil, err
@ -120,7 +127,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
log.Error("Error while loading averages for footprint")
return nil, err
}

View File

@ -8,9 +8,9 @@ import (
"bytes"
"encoding/json"
"fmt"
"math"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
@ -42,8 +42,8 @@ func HandleImportFlag(flag string) error {
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
if err = dec.Decode(&jobMeta); err != nil {
job := schema.JobMeta{BaseJob: schema.JobDefaults}
if err = dec.Decode(&job); err != nil {
log.Warn("Error while decoding raw json metadata for import")
return err
}
@ -67,32 +67,68 @@ func HandleImportFlag(flag string) error {
return err
}
// checkJobData(&jobData)
job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
// if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
// if err != nil {
// log.Warn("Error while finding job in jobRepository")
// return err
// }
//
// return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist")
// }
//
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
return err
}
// TODO: Other metrics...
job.LoadAvg = loadJobStat(&jobMeta, "cpu_load")
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
job.MemUsedMax = loadJobStat(&jobMeta, "mem_used")
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(&job, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
log.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(&job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
@ -110,7 +146,7 @@ func HandleImportFlag(flag string) error {
return err
}
if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil {
if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil {
log.Error("Error while importing job")
return err
}
@ -122,8 +158,8 @@ func HandleImportFlag(flag string) error {
}
for _, tag := range job.Tags {
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
log.Error("Error while adding or creating tag")
if err := r.ImportTag(id, tag.Type, tag.Name, tag.Scope); err != nil {
log.Error("Error while adding or creating tag on import")
return err
}
}
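A worked example of the kWh formula used in the energy-footprint loop above, with assumed numbers: a 250 W node-average over 4 nodes for a 7200 s job gives (250 * 4) * (7200 / 3600) / 1000 = 2 kWh.

package main

import (
    "fmt"
    "math"
)

// jobEnergyKWh mirrors the importer's computation: watts times nodes times
// hours, scaled to kWh and rounded to two digits.
func jobEnergyKWh(avgPowerW float64, numNodes, durationS int) float64 {
    raw := (avgPowerW * float64(numNodes)) * (float64(durationS) / 3600.0) / 1000.0
    return math.Round(raw*100.0) / 100.0
}

func main() {
    fmt.Println(jobEnergyKWh(250, 4, 7200)) // 2
}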

View File

@ -45,6 +45,9 @@ func setup(t *testing.T) *repository.JobRepository {
"jwts": {
"max-age": "2m"
},
"apiAllowedIPs": [
"*"
],
"clusters": [
{
"name": "testcluster",
@ -82,7 +85,7 @@ func setup(t *testing.T) *repository.JobRepository {
if err := os.Mkdir(jobarchive, 0777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
t.Fatal(err)
}
fritzArchive := filepath.Join(tmpdir, "job-archive", "fritz")

View File

@ -7,6 +7,7 @@ package importer
import (
"encoding/json"
"fmt"
"math"
"strings"
"time"
@ -16,6 +17,11 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
const (
addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)"
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
)
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
func InitDB() error {
@ -60,13 +66,66 @@ func InitDB() error {
StartTimeUnix: jobMeta.StartTime,
}
// TODO: Other metrics...
job.LoadAvg = loadJobStat(jobMeta, "cpu_load")
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
job.MemUsedMax = loadJobStat(jobMeta, "mem_used")
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
return err
}
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
log.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
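// Worked example with assumed numbers: a 250 W all-node average on 4 nodes
// for a 7200 s job gives ((250 * 4) * (7200 / 3600)) / 1000 = 2.0 kWh.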
rawEnergy := ((repository.LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
@ -88,7 +147,8 @@ func InitDB() error {
continue
}
id, err := r.TransactionAdd(t, job)
id, err := r.TransactionAddNamed(t,
repository.NamedJobInsert, job)
if err != nil {
log.Errorf("repository initDB(): %v", err)
errorOccured++
@ -99,7 +159,9 @@ func InitDB() error {
tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr]
if !ok {
tagId, err = r.TransactionAddTag(t, tag)
tagId, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
if err != nil {
log.Errorf("Error adding tag: %v", err)
errorOccured++
@ -108,7 +170,9 @@ func InitDB() error {
tags[tagstr] = tagId
}
r.TransactionSetTag(t, id, tagId)
r.TransactionAdd(t,
setTagQuery,
id, tagId)
}
if err == nil {
@ -150,18 +214,6 @@ func SanityChecks(job *schema.BaseJob) error {
return nil
}
func loadJobStat(job *schema.JobMeta, metric string) float64 {
if stats, ok := job.Statistics[metric]; ok {
if metric == "mem_used" {
return stats.Max
} else {
return stats.Avg
}
}
return 0.0
}
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
// var newUnit schema.Unit

File diff suppressed because it is too large

View File

@ -0,0 +1,383 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricDataDispatcher
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/resampler"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
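// Example key for a hypothetical job: ID 1337, running, metrics flops_any and
// mem_bw at node scope, resolution 600 -> "1337(running):[[flops_any mem_bw]],[[node]]-600"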
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
job.ID, job.State, metrics, scopes, resolution)
}
// Fetches the metric data for a job.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ interface{}, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
config.Keys.DisableArchive {
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
log.Warnf("partial error: %s", err.Error())
// return err, 0, 0 // Reactivating will block archiving on one partial error
} else {
log.Error("Error while loading job data from metric repository")
return err, 0, 0
}
}
size = jd.Size()
} else {
var jd_temp schema.JobData
jd_temp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
log.Error("Error while loading job data from archive")
return err, 0, 0
}
// Deep copy the cached archive hashmap
jd = metricdata.DeepCopy(jd_temp)
// Resampling for archived data.
// Pass the resolution from frontend here.
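// LargestTriangleThreeBucket downsamples each series towards the requested
// resolution while aiming to preserve the visual shape of the curve.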
for _, v := range jd {
for _, v_ := range v {
timestep := 0
for i := 0; i < len(v_.Series); i += 1 {
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, v_.Timestep, resolution)
if err != nil {
return err, 0, 0
}
}
v_.Timestep = timestep
}
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// FIXME: Review: Is this really necessary or correct?
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
// to be available at the scope 'node'. If a job has a lot of nodes,
// statisticsSeries should be available so that a min/median/max Graph can be
// used instead of a lot of single lines.
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
// Existing (archived) StatsSeries can be 'min/mean/max'!
const maxSeriesSize int = 15
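// Example: a node-scope metric of a 32-node job has 32 series (> 15), so a
// min/median/max band is computed; an 8-node job keeps its individual lines.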
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
log.Error("Error in returned dataset")
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalization?
if err != nil {
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for statsTable in frontend: Return scoped statistics by metric.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
log.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return nil, err
}
return scopedStats, nil
}
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadStatsFromArchive(job, metrics)
}
data := make(map[string]schema.MetricStatistics, len(metrics))
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
log.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return data, err
}
for _, m := range metrics {
sum := 0.0
// Start min/max at the extremes so the first sample always updates them
min, max := math.MaxFloat64, -math.MaxFloat64
nodes, ok := stats[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
log.Warnf("partial error: %s", err.Error())
} else {
log.Error("Error while loading node data from metric repository")
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, totalNodes, hasNextPage, err := repo.LoadNodeListData(cluster, subCluster, nodeFilter, metrics, scopes, resolution, from, to, page, ctx)
if err != nil {
if len(data) != 0 {
log.Warnf("partial error: %s", err.Error())
} else {
log.Error("Error while loading node data from metric repository")
return nil, totalNodes, hasNextPage, err
}
}
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, totalNodes, hasNextPage, nil
}

File diff suppressed because it is too large

View File

@ -10,9 +10,12 @@ import (
"encoding/json"
"errors"
"fmt"
"math"
"sort"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
@ -60,7 +63,10 @@ func (idb *InfluxDBv2DataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.JobData, error) {
ctx context.Context,
resolution int) (schema.JobData, error) {
log.Infof("InfluxDB 2 Backend: Resolution Scaling not Implemented, will return default timestep. Requested Resolution %d", resolution)
measurementsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
@ -84,7 +90,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(
query := ""
switch scope {
case "node":
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
// Get Finest Granularity, Group By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows <-- Resolution could be added here?
// log.Info("Scope 'node' requested. ")
query = fmt.Sprintf(`
from(bucket: "%s")
@ -114,6 +120,12 @@ func (idb *InfluxDBv2DataRepository) LoadData(
// idb.bucket,
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
// measurementsCond, hostsCond)
case "hwthread":
log.Info(" Scope 'hwthread' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "accelerator":
log.Info(" Scope 'accelerator' requested, but not yet supported: Will return 'node' scope only. ")
continue
default:
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
continue
@ -171,6 +183,11 @@ func (idb *InfluxDBv2DataRepository) LoadData(
}
case "socket":
continue
case "accelerator":
continue
case "hwthread":
// See below @ core
continue
case "core":
continue
// Include Series.Id in hostSeries
@ -299,6 +316,53 @@ func (idb *InfluxDBv2DataRepository) LoadStats(
return stats, nil
}
// Used in Job-View StatsTable
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error) {
// Assumption: idb.LoadData() only returns node-scope series - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := idb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
log.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Used in Systems-View @ Node-Overview
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
@ -306,8 +370,206 @@ func (idb *InfluxDBv2DataRepository) LoadNodeData(
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
// TODO : Implement to be used in Analysis- und System/Node-View
log.Infof("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes)
// Note: scopes[] Array will be ignored, only return node scope
return nil, errors.New("METRICDATA/INFLUXV2 > unimplemented for InfluxDBv2DataRepository")
// CONVERT ARGS TO INFLUX
measurementsConds := make([]string, 0)
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0)
if nodes == nil {
var allNodes []string
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
allNodes = append(allNodes, nodeList.PrintList()...)
}
for _, node := range allNodes {
nodes = append(nodes, node)
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node))
}
} else {
for _, node := range nodes {
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node))
}
}
hostsCond := strings.Join(hostsConds, " or ")
// BUILD AND PERFORM QUERY
query := fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(from), idb.formatTime(to),
measurementsCond, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
// HANDLE QUERY RETURN
// Collect Float Arrays for Node@Metric -> No Scope Handling!
influxData := make(map[string]map[string][]schema.Float)
for rows.Next() {
row := rows.Record()
host, field := row.ValueByKey("hostname").(string), row.Measurement()
influxHostData, ok := influxData[host]
if !ok {
influxHostData = make(map[string][]schema.Float)
influxData[host] = influxHostData
}
influxFieldData, ok := influxData[host][field]
if !ok {
influxFieldData = make([]schema.Float, 0)
influxData[host][field] = influxFieldData
}
val, ok := row.Value().(float64)
if ok {
influxData[host][field] = append(influxData[host][field], schema.Float(val))
} else {
influxData[host][field] = append(influxData[host][field], schema.Float(0))
}
}
// BUILD FUNCTION RETURN
data := make(map[string]map[string][]*schema.JobMetric)
for node, metricData := range influxData {
nodeData, ok := data[node]
if !ok {
nodeData = make(map[string][]*schema.JobMetric)
data[node] = nodeData
}
for metric, floatArray := range metricData {
// Start min/max at the extremes so the first sample always updates them
avg, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, val := range floatArray {
avg += float64(val)
min = math.Min(min, float64(val))
max = math.Max(max, float64(val))
}
stats := schema.MetricStatistics{
Avg: (math.Round((avg/float64(len(floatArray)))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
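// Example with assumed samples [100, 200, 300]: Avg 200.0, Min 100.0, Max 300.0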
mc := archive.GetMetricConfig(cluster, metric)
nodeData[metric] = append(nodeData[metric], &schema.JobMetric{
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: node,
Statistics: stats,
Data: floatArray,
},
},
})
}
}
return data, nil
}
// Used in Systems-View @ Node-List
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
// Assumption: idb.LoadData() only returns node-scope series - use node scope for NodeList
// 0) Init additional vars
var totalNodes int = 0
var hasNextPage bool = false
// 1) Get list of all nodes
var nodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
nodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
nodes = append(nodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range nodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
nodes = filteredNodes
}
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ...
totalNodes = len(nodes)
sort.Strings(nodes)
// 3) Apply paging
if len(nodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > len(nodes) {
end = len(nodes)
hasNextPage = false
} else {
hasNextPage = true
}
nodes = nodes[start:end]
}
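// Worked example: 120 nodes, page.Page = 2, page.ItemsPerPage = 50
// -> start = 50, end = 100, nodes[50:100] is returned and hasNextPage = true.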
// 4) Fetch And Convert Data, use idb.LoadNodeData() for query
rawNodeData, err := idb.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
log.Error(fmt.Sprintf("Error while loading influx nodeData for nodeListData %#v\n", err))
return nil, totalNodes, hasNextPage, err
}
data := make(map[string]schema.JobData)
for node, nodeData := range rawNodeData {
// Init Nested Map Data Structures If Not Found
hostData, ok := data[node]
if !ok {
hostData = make(schema.JobData)
data[node] = hostData
}
for metric, nodeMetricData := range nodeData {
metricData, ok := hostData[metric]
if !ok {
metricData = make(map[schema.MetricScope]*schema.JobMetric)
data[node][metric] = metricData
}
data[node][metric][schema.MetricScopeNode] = nodeMetricData[0] // Only Node Scope Returned from loadNodeData
}
}
return data, totalNodes, hasNextPage, nil
}

View File

@ -8,13 +8,11 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
@ -24,21 +22,24 @@ type MetricDataRepository interface {
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of hosts to a map of metrics at the requested scopes for that node.
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
var useArchive bool
func Init(disableArchive bool) error {
useArchive = !disableArchive
func Init() error {
for _, cluster := range config.Keys.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
@ -73,284 +74,13 @@ func Init(disableArchive bool) error {
return nil
}
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// Fetches the metric data for a job.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
!useArchive {
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx)
if err != nil {
if len(jd) != 0 {
log.Errorf("partial error: %s", err.Error())
return err, 0, 0
} else {
log.Error("Error while loading job data from metric repository")
return err, 0, 0
}
}
size = jd.Size()
} else {
jd, err = archive.GetHandle().LoadJobData(job)
if err != nil {
log.Error("Error while loading job data from archive")
return err, 0, 0
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
prepareJobData(job, jd, scopes)
return jd, ttl, size
})
if err, ok := data.(error); ok {
log.Error("Error in returned dataset")
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning && useArchive {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
if err != nil {
log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for the node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
var err error
repo, ok := metricDataRepos[cluster]
if !ok {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
log.Warnf("partial error: %s", err.Error())
} else {
log.Error("Error while loading node data from metric repository")
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]",
job.ID, job.State, metrics, scopes)
}
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
// to be available at the scope 'node'. If a job has a lot of nodes,
// statisticsSeries should be available so that a min/mean/max Graph can be
// used instead of a lot of single lines.
func prepareJobData(
job *schema.Job,
jobData schema.JobData,
scopes []schema.MetricScope,
) {
const maxSeriesSize int = 15
for _, scopes := range jobData {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jobData.AddNodeScope("flops_any")
jobData.AddNodeScope("mem_bw")
}
}
// Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
// TODO: Talk about this! What resolutions to store data at...
scopes := []schema.MetricScope{schema.MetricScopeNode}
if job.NumNodes <= 8 {
scopes = append(scopes, schema.MetricScopeCore)
}
jobData, err := LoadData(job, allMetrics, scopes, ctx)
if err != nil {
log.Error("Error wile loading job data for archiving")
return nil, err
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// TODO/FIXME: Calc average for non-node metrics as well!
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if !useArchive {
return jobMeta, nil
}
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData)
return repo, err
}

View File

@ -20,6 +20,7 @@ import (
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
@ -166,10 +167,10 @@ func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(config.Username, prom_pw, "", promapi.DefaultRoundTripper)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set.")
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
@ -204,8 +205,8 @@ func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string) (string, error) {
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
@ -233,12 +234,13 @@ func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream) schema.Series {
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i, _ := range values {
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
@ -263,8 +265,9 @@ func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.JobData, error) {
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
@ -306,7 +309,6 @@ func (pdb *PrometheusDataRepository) LoadData(
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
log.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
@ -335,7 +337,7 @@ func (pdb *PrometheusDataRepository) LoadData(
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0{
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
@ -351,12 +353,12 @@ func (pdb *PrometheusDataRepository) LoadData(
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
log.Warn("Error while loading job for stats")
return nil, err
@ -376,7 +378,8 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
@ -411,7 +414,6 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
@ -445,3 +447,188 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
log.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
// Implemented by NHR@FAU; Used in Job-View StatsTable
func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
log.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Implemented by NHR@FAU; Used in NodeList-View
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for NodeList
// 0) Init additional vars
var totalNodes int = 0
var hasNextPage bool = false
// 1) Get list of all nodes
var nodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
nodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
nodes = append(nodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range nodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
nodes = filteredNodes
}
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ...
totalNodes = len(nodes)
sort.Strings(nodes)
// 3) Apply paging
if len(nodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > len(nodes) {
end = len(nodes)
hasNextPage = false
} else {
hasNextPage = true
}
nodes = nodes[start:end]
}
// 4) Fetch Data, based on pdb.LoadNodeData()
t0 := time.Now()
// Map of hosts of jobData
data := make(map[string]schema.JobData)
// query db for each metric
// TODO: scopes seems to always be empty here
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
log.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
return nil, totalNodes, hasNextPage, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
log.Warn("Error while formatting prometheus query")
return nil, totalNodes, hasNextPage, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
log.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, totalNodes, hasNextPage, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
log.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
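// Example: a one-hour range with a 60 s timestep yields steps = 3600 / 60 = 60.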
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(schema.JobData)
data[hostname] = hostdata
}
metricdata, ok := hostdata[metric]
if !ok {
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
data[hostname][metric] = metricdata
}
// output per host, metric and scope
scopeData, ok := metricdata[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}
data[hostname][metric][scope] = scopeData
}
}
}
}
t1 := time.Since(t0)
log.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
return data, totalNodes, hasNextPage, nil
}

View File

@ -9,10 +9,11 @@ import (
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
@ -27,14 +28,25 @@ func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.JobData, error) {
ctx context.Context,
resolution int) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx)
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error) {
panic("TODO")
}
@ -48,3 +60,62 @@ func (tmdr *TestMetricDataRepository) LoadNodeData(
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
panic("TODO")
}
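// DeepCopy returns a fully independent copy of jd_temp so that cached archive
// data is never mutated by later processing (e.g. resampling in the dispatcher).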
func DeepCopy(jd_temp schema.JobData) schema.JobData {
var jd schema.JobData
jd = make(schema.JobData, len(jd_temp))
for k, v := range jd_temp {
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jd_temp[k]))
for k_, v_ := range v {
jd[k][k_] = new(schema.JobMetric)
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
for i := 0; i < len(v_.Series); i += 1 {
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
jd[k][k_].Series[i].Id = v_.Series[i].Id
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
}
jd[k][k_].Timestep = v_.Timestep
jd[k][k_].Unit.Base = v_.Unit.Base
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
if v_.StatisticsSeries != nil {
// Init Slices
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
// Copy Data
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
// Handle Percentiles (allocate the destination map first; writing to a nil map panics).
// Assumes the int-keyed Percentiles map from pkg/schema.
if len(v_.StatisticsSeries.Percentiles) > 0 {
jd[k][k_].StatisticsSeries.Percentiles = make(map[int][]schema.Float, len(v_.StatisticsSeries.Percentiles))
}
for k__, v__ := range v_.StatisticsSeries.Percentiles {
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
}
} else {
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
}
}
}
return jd
}

View File

@ -59,17 +59,15 @@ func Connect(driver string, db string) {
} else {
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
}
if err != nil {
log.Fatal(err)
}
case "mysql":
opts.URL += "?multiStatements=true"
dbHandle, err = sqlx.Open("mysql", opts.URL)
if err != nil {
log.Fatalf("sqlx.Open() error: %v", err)
}
default:
log.Fatalf("unsupported database driver: %s", driver)
log.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver)
}
if err != nil {
log.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error())
}
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
@ -80,7 +78,7 @@ func Connect(driver string, db string) {
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
err = checkDBVersion(driver, dbHandle.DB)
if err != nil {
log.Fatal(err)
log.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
}
})
}

View File

@ -5,17 +5,16 @@
package repository
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"math"
"strconv"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
@ -30,12 +29,10 @@ var (
)
type JobRepository struct {
DB *sqlx.DB
stmtCache *sq.StmtCache
cache *lrucache.Cache
archiveChannel chan *schema.Job
driver string
archivePending sync.WaitGroup
DB *sqlx.DB
stmtCache *sq.StmtCache
cache *lrucache.Cache
driver string
}
func GetJobRepository() *JobRepository {
@ -46,47 +43,48 @@ func GetJobRepository() *JobRepository {
DB: db.DB,
driver: db.Driver,
stmtCache: sq.NewStmtCache(db.DB),
cache: lrucache.New(1024 * 1024),
archiveChannel: make(chan *schema.Job, 128),
stmtCache: sq.NewStmtCache(db.DB),
cache: lrucache.New(1024 * 1024),
}
// start archiving worker
go jobRepoInstance.archivingWorker()
})
return jobRepoInstance
}
var jobColumns []string = []string{
"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id",
"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id",
"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
"job.duration", "job.walltime", "job.resources", "job.mem_used_max", "job.flops_any_avg", "job.mem_bw_avg", "job.load_avg", // "job.meta_data",
"job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy",
}
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
job := &schema.Job{}
if err := row.Scan(
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
&job.Duration, &job.Walltime, &job.RawResources, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.LoadAvg /*&job.RawMetaData*/); err != nil {
&job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil {
log.Warnf("Error while scanning rows (Job): %v", err)
return nil, err
}
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
log.Warn("Error while unmarhsaling raw resources json")
log.Warn("Error while unmarshaling raw resources json")
return nil, err
}
job.RawResources = nil
// if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
// return nil, err
// }
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
log.Warnf("Error while unmarshaling raw footprint json: %v", err)
return nil, err
}
job.RawFootprint = nil
job.StartTime = time.Unix(job.StartTimeUnix, 0)
if job.Duration == 0 && job.State == schema.JobStateRunning {
// Always ensure accurate duration for running jobs
if job.State == schema.JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds())
}
job.RawResources = nil
return job, nil
}
@ -205,7 +203,10 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
return err
}
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
if _, err = sq.Update("job").
Set("meta_data", job.RawMetaData).
Where("job.id = ?", job.ID).
RunWith(r.stmtCache).Exec(); err != nil {
log.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID)
return err
}
@ -214,222 +215,54 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
return archive.UpdateMetadata(job, job.MetaData)
}
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
jobId *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job.start_time = ?", *startTime)
}
log.Debugf("Timer Find %s", time.Since(start))
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindAll(
jobId *int64,
cluster *string,
startTime *int64,
) ([]*schema.Job, error) {
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job.start_time = ?", *startTime)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
log.Warn("Error while scanning for job footprint")
return nil, err
}
jobs := make([]*schema.Job, 0, 10)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
}
log.Debugf("Timer FindAll %s", time.Since(start))
return jobs, nil
}
// FindById executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
func (r *JobRepository) FindConcurrentJobs(
ctx context.Context,
job *schema.Job,
) (*model.JobLinkResultList, error) {
if job == nil {
if len(job.RawFootprint) == 0 {
return nil, nil
}
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
if qerr != nil {
return nil, qerr
}
query = query.Where("cluster = ?", job.Cluster)
var startTime int64
var stopTime int64
startTime = job.StartTimeUnix
hostname := job.Resources[0].Hostname
if job.State == schema.JobStateRunning {
stopTime = time.Now().Unix()
} else {
stopTime = startTime + int64(job.Duration)
}
// Add 200s overlap for jobs start time at the end
startTimeTail := startTime + 10
stopTimeTail := stopTime - 200
startTimeFront := startTime + 200
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
"running", startTimeTail, stopTimeTail, startTime)
queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%"))
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
log.Warn("Error while unmarshaling raw footprint json")
return nil, err
}
items := make([]*model.JobLink, 0, 10)
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
log.Debugf("Timer FetchFootprint %s", time.Since(start))
return job.Footprint, nil
}
for rows.Next() {
var id, jobId, startTime sql.NullInt64
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
})
}
func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) {
start := time.Now()
cachekey := fmt.Sprintf("energyFootprint:%d", job.ID)
if cached := r.cache.Get(cachekey, nil); cached != nil {
job.EnergyFootprint = cached.(map[string]float64)
return job.EnergyFootprint, nil
}
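// Cache miss: load the raw energy_footprint JSON from the job table, decode it,
// and cache the decoded map for 24 h via r.cache.Put below.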
rows, err = queryRunning.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
if err := sq.Select("job.energy_footprint").From("job").Where("job.id = ?", job.ID).
RunWith(r.stmtCache).QueryRow().Scan(&job.RawEnergyFootprint); err != nil {
log.Warn("Error while scanning for job energy_footprint")
return nil, err
}
for rows.Next() {
var id, jobId, startTime sql.NullInt64
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
})
}
if len(job.RawEnergyFootprint) == 0 {
return nil, nil
}
cnt := len(items)
return &model.JobLinkResultList{
ListQuery: &queryString,
Items: items,
Count: &cnt,
}, nil
}
// Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transfered!
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
if err := json.Unmarshal(job.RawEnergyFootprint, &job.EnergyFootprint); err != nil {
log.Warn("Error while unmarshaling raw energy footprint json")
return nil, err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
}
res, err := r.DB.NamedExec(`INSERT INTO job (
job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
) VALUES (
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
);`, job)
if err != nil {
return -1, err
}
return res.LastInsertId()
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) Stop(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
stmt := sq.Update("job").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
r.cache.Put(cachekey, job.EnergyFootprint, len(job.EnergyFootprint), 24*time.Hour)
log.Debugf("Timer FetchEnergyFootprint %s", time.Since(start))
return job.EnergyFootprint, nil
}
func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
@ -461,119 +294,22 @@ func (r *JobRepository) DeleteJobById(id int64) error {
return err
}
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
stmt := sq.Update("job").
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", job)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) MarkArchived(
jobId int64,
monitoringStatus int32,
metricStats map[string]schema.JobStatistics,
) error {
stmt := sq.Update("job").
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
for metric, stats := range metricStats {
switch metric {
case "flops_any":
stmt = stmt.Set("flops_any_avg", stats.Avg)
case "mem_used":
stmt = stmt.Set("mem_used_max", stats.Max)
case "mem_bw":
stmt = stmt.Set("mem_bw_avg", stats.Avg)
case "load":
stmt = stmt.Set("load_avg", stats.Avg)
case "cpu_load":
stmt = stmt.Set("load_avg", stats.Avg)
case "net_bw":
stmt = stmt.Set("net_bw_avg", stats.Avg)
case "file_bw":
stmt = stmt.Set("file_bw_avg", stats.Avg)
default:
log.Debugf("MarkArchived() Metric '%v' unknown", metric)
}
}
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
log.Warn("Error while marking job as archived")
return err
}
return nil
}
// Archiving worker thread
func (r *JobRepository) archivingWorker() {
for {
select {
case job, ok := <-r.archiveChannel:
if !ok {
break
}
start := time.Now()
// The metadata itself is not used here; the call loads JobMeta into the cache
// and will fail if the job metadata is not in the repository.
if _, err := r.FetchMetadata(job); err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
continue
}
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push it into the configured archive backend
// TODO: Maybe use context with cancel/timeout here
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
if err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
continue
}
// Update the jobs database entry one last time:
if err := r.MarkArchived(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
continue
}
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
log.Printf("archiving job (dbid: %d) successful", job.ID)
r.archivePending.Done()
}
}
}
// Trigger async archiving
func (r *JobRepository) TriggerArchiving(job *schema.Job) {
r.archivePending.Add(1)
r.archiveChannel <- job
}
// Wait for background thread to finish pending archiving operations
func (r *JobRepository) WaitForArchiving() {
// wait for the archiving worker to process all remaining jobs
r.archivePending.Wait()
}
func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) {
if _, err := strconv.Atoi(searchterm); err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId
return searchterm, "", "", ""
} else { // Has to have letters and logged-in user for other guesses
if user != nil {
// Find username by username in job table (match)
uresult, _ := r.FindColumnValue(user, searchterm, "job", "hpc_user", "hpc_user", false)
if uresult != "" {
return "", uresult, "", ""
}
// Find username by real name in hpc_user table (like)
nresult, _ := r.FindColumnValue(user, searchterm, "hpc_user", "username", "name", true)
if nresult != "" {
return "", nresult, "", ""
}
// Find projectId by projectId in job table (match)
presult, _ := r.FindColumnValue(user, searchterm, "job", "project", "project", false)
if presult != "" {
return "", "", presult, ""
@ -655,7 +391,7 @@ func (r *JobRepository) Partitions(cluster string) ([]string, error) {
start := time.Now()
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
parts := []string{}
if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
return nil, 0, 1000
}
@ -712,6 +448,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
return subclusters, nil
}
// FIXME: Set duration to requested walltime?
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
start := time.Now()
res, err := sq.Update("job").
@ -740,6 +477,46 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
return nil
}
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
query := sq.Select(jobColumns...).From("job").
Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
Where("job.job_state = 'running'").
Where("job.duration > 600")
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
}
log.Infof("Return job count %d", len(jobs))
return jobs, nil
}
func (r *JobRepository) UpdateDuration() error {
stmnt := sq.Update("job").
Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())).
Where("job_state = 'running'")
_, err := stmnt.RunWith(r.stmtCache).Exec()
if err != nil {
return err
}
return nil
}
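For clarity, the statement UpdateDuration renders (a sketch; the exact placeholder style depends on the configured driver):

	// UPDATE job SET duration = ? - job.start_time WHERE job_state = 'running'
	// single bound argument: time.Now().Unix()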
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
var query sq.SelectBuilder
@ -778,27 +555,118 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64
return jobs, nil
}
const NamedJobInsert string = `INSERT INTO job (
job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data,
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
) VALUES (
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data,
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
);`
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
	res, err := r.DB.NamedExec(NamedJobInsert, job)
	if err != nil {
		log.Warn("Error while NamedJobInsert")
		return 0, err
	}

	id, err := res.LastInsertId()
	if err != nil {
		log.Warn("Error while getting last insert ID")
		return 0, err
	}

	return id, nil
}
func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error {
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
return err
}
return nil
}
func (r *JobRepository) MarkArchived(
stmt sq.UpdateBuilder,
monitoringStatus int32,
) sq.UpdateBuilder {
return stmt.Set("monitoring_status", monitoringStatus)
}
func (r *JobRepository) UpdateEnergy(
stmt sq.UpdateBuilder,
jobMeta *schema.JobMeta,
) (sq.UpdateBuilder, error) {
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
return stmt, err
}
energyFootprint := make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules or Wh)
log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
}
energyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
// log.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy)
}
var rawFootprint []byte
if rawFootprint, err = json.Marshal(energyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return stmt, err
}
return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100.0) / 100.0)), nil
}
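To make the conversion in UpdateEnergy concrete, a worked example with invented numbers: a 4-node job that ran 7200 s with an all-node power average of 250 W:

	// (250 W * 4 nodes) * (7200 s / 3600) / 1000 = 2.0 kWh
	rawEnergy := ((250.0 * 4.0) * (7200.0 / 3600.0)) / 1000.0
	metricEnergy := math.Round(rawEnergy*100.0) / 100.0 // 2.0 kWh, rounded to two digits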
func (r *JobRepository) UpdateFootprint(
stmt sq.UpdateBuilder,
jobMeta *schema.JobMeta,
) (sq.UpdateBuilder, error) {
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil {
log.Errorf("cannot get subcluster: %s", err.Error())
return stmt, err
}
footprint := make(map[string]float64)
for _, fp := range sc.Footprint {
var statType string
for _, gm := range archive.GlobalMetricList {
if gm.Name == fp {
statType = gm.Footprint
}
}
if statType != "avg" && statType != "min" && statType != "max" {
log.Warnf("unknown statType for footprint update: %s", statType)
return stmt, fmt.Errorf("unknown statType for footprint update: %s", statType)
}
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil && sc.MetricConfig[i].Footprint != "" {
	statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
footprint[name] = LoadJobStat(jobMeta, fp, statType)
}
var rawFootprint []byte
if rawFootprint, err = json.Marshal(footprint); err != nil {
log.Warnf("Error while marshaling footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return stmt, err
}
return stmt.Set("footprint", string(rawFootprint)), nil
}
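The builder helpers above (MarkArchived, UpdateEnergy, UpdateFootprint) are designed to stack onto a single UPDATE that is flushed once via Execute; a minimal sketch of the intended composition, assuming jobID and jobMeta come from the caller and with error handling abbreviated:

	stmt := sq.Update("job").Where("job.id = ?", jobID)
	stmt, _ = r.UpdateFootprint(stmt, jobMeta) // adds the footprint JSON column
	stmt, _ = r.UpdateEnergy(stmt, jobMeta)    // adds energy and energy_footprint
	stmt = r.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
	if err := r.Execute(stmt); err != nil {
		log.Errorf("final job update failed: %v", err)
	}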

View File

@ -0,0 +1,75 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"encoding/json"
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
const NamedJobInsert string = `INSERT INTO job (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
) VALUES (
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) {
res, err := r.DB.NamedExec(NamedJobInsert, job)
if err != nil {
log.Warn("Error while NamedJobInsert")
return 0, err
}
id, err := res.LastInsertId()
if err != nil {
log.Warn("Error while getting last insert ID")
return 0, err
}
return id, nil
}
// Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transferred!
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
}
return r.InsertJob(job)
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) Stop(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
stmt := sq.Update("job").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
}
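A sketch of the intended lifecycle around these two calls (jobMeta, duration and the final state are assumed to come from the REST handler; schema.JobStateCompleted and schema.MonitoringStatusRunningOrArchiving are assumed schema constants):

	// On job start: Start marshals the raw JSON fields and inserts the row.
	id, err := r.Start(jobMeta)
	if err != nil {
		log.Errorf("insert failed: %v", err)
	}
	// On job stop: finalize state, duration and monitoring status by database id.
	if err := r.Stop(id, duration, schema.JobStateCompleted,
		schema.MonitoringStatusRunningOrArchiving); err != nil {
		log.Errorf("stop failed: %v", err)
	}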

View File

@ -0,0 +1,263 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"database/sql"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
jobId *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job.start_time = ?", *startTime)
}
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
log.Debugf("Timer Find %s", time.Since(start))
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindAll executes a SQL query to find all batch jobs that match
// the batch job id, the cluster name, and the start time of the job
// in UNIX epoch time seconds.
// It returns a slice of pointers to schema.Job data structures and an error variable.
// An empty slice is returned if no job matches.
func (r *JobRepository) FindAll(
jobId *int64,
cluster *string,
startTime *int64,
) ([]*schema.Job, error) {
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job.start_time = ?", *startTime)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
jobs := make([]*schema.Job, 0, 10)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
}
log.Debugf("Timer FindAll %s", time.Since(start))
return jobs, nil
}
// FindById executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
q, qerr := SecurityCheck(ctx, q)
if qerr != nil {
return nil, qerr
}
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdWithUser executes a SQL query to find a specific batch job.
// The job is queried using the database id. The user is passed directly,
// instead as part of the context.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
q, qerr := SecurityCheckWithUser(user, q)
if qerr != nil {
return nil, qerr
}
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdDirect executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByJobId executes a SQL query to find a specific batch job.
// The job is queried using the slurm id and the clustername.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").
Where("job.job_id = ?", jobId).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
q, qerr := SecurityCheck(ctx, q)
if qerr != nil {
return nil, qerr
}
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// IsJobOwner executes a SQL query to find a specific batch job.
// The job is queried using the slurm id, a username and the cluster.
// It returns a bool.
// If job was found, user is owner: test err != sql.ErrNoRows
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
q := sq.Select("id").
From("job").
Where("job.job_id = ?", jobId).
Where("job.hpc_user = ?", user).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
return err != sql.ErrNoRows
}
func (r *JobRepository) FindConcurrentJobs(
ctx context.Context,
job *schema.Job,
) (*model.JobLinkResultList, error) {
if job == nil {
return nil, nil
}
query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job"))
if qerr != nil {
return nil, qerr
}
query = query.Where("cluster = ?", job.Cluster)
var startTime int64
var stopTime int64
startTime = job.StartTimeUnix
hostname := job.Resources[0].Hostname
if job.State == schema.JobStateRunning {
stopTime = time.Now().Unix()
} else {
stopTime = startTime + int64(job.Duration)
}
// Add 200s overlap for jobs start time at the end
startTimeTail := startTime + 10
stopTimeTail := stopTime - 200
startTimeFront := startTime + 200
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
"running", startTimeTail, stopTimeTail, startTime)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
return nil, err
}
items := make([]*model.JobLink, 0, 10)
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
for rows.Next() {
var id, jobId, startTime sql.NullInt64
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
})
}
}
rows, err = queryRunning.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
return nil, err
}
for rows.Next() {
var id, jobId, startTime sql.NullInt64
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
})
}
}
cnt := len(items)
return &model.JobLinkResultList{
ListQuery: &queryString,
Items: items,
Count: &cnt,
}, nil
}
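To illustrate the window arithmetic above with invented numbers: for a finished job with start_time = 1000 and Duration = 5000 (stopTime = 6000), startTimeTail is 1010, stopTimeTail is 5800 and startTimeFront is 1200. The non-running branch then matches jobs whose start lies in [1010, 5800], whose end lies in [1200, 5800], or which span the whole job (start before 1000 and end after 6000); the running branch matches still-running jobs started in the same window or before 1000. Both branches additionally require at least one exact hostname match in the resources JSON.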

View File

@ -0,0 +1,349 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
func (r *JobRepository) QueryJobs(
ctx context.Context,
filters []*model.JobFilter,
page *model.PageRequest,
order *model.OrderByInput,
) ([]*schema.Job, error) {
query, qerr := SecurityCheck(ctx, sq.Select(jobColumns...).From("job"))
if qerr != nil {
return nil, qerr
}
if order != nil {
field := toSnakeCase(order.Field)
if order.Type == "col" {
// "col": Fixed column name query
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
}
} else {
// "foot": Order by footprint JSON field values
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(meta_data)")
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
}
}
}
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
queryString, queryVars, _ := query.ToSql()
log.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows (Jobs)")
return nil, err
}
jobs = append(jobs, job)
}
return jobs, nil
}
func (r *JobRepository) CountJobs(
ctx context.Context,
filters []*model.JobFilter,
) (int, error) {
// DISTINCT count for tag filters; does not affect other queries
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
if qerr != nil {
return 0, qerr
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
var count int
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return 0, err
}
return count, nil
}
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
if user == nil {
var qnil sq.SelectBuilder
return qnil, fmt.Errorf("user context is nil")
}
switch {
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
return query, nil
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
return query, nil
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
if len(user.Projects) != 0 {
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
} else {
log.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
return query.Where("job.hpc_user = ?", user.Username), nil
}
case user.HasRole(schema.RoleUser): // User : Only personal jobs
return query.Where("job.hpc_user = ?", user.Username), nil
default: // No known Role, return error
var qnil sq.SelectBuilder
return qnil, fmt.Errorf("user has no or unknown roles")
}
}
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
return SecurityCheckWithUser(user, query)
}
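A sketch of how the role check scopes an arbitrary query (the user literal is illustrative; the role strings follow the schema.User shape used in the tests below):

	user := &schema.User{Username: "alice", Roles: []string{"user"}}
	q, err := SecurityCheckWithUser(user, sq.Select("count(*)").From("job"))
	if err != nil {
		log.Warnf("security check failed: %v", err)
	}
	// q now renders as: SELECT count(*) FROM job WHERE job.hpc_user = ?  (arg: "alice")
	_ = q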
// Build a sq.SelectBuilder out of a schema.JobFilter.
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
}
if filter.DbID != nil {
dbIDs := make([]string, len(filter.DbID))
for i, val := range filter.DbID {
dbIDs[i] = val
}
query = query.Where(sq.Eq{"job.id": dbIDs})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.User != nil {
query = buildStringCondition("job.hpc_user", filter.User, query)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.JobName != nil {
query = buildMetaJsonCondition("jobName", filter.JobName, query)
}
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.Partition != nil {
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
}
if filter.Exclusive != nil {
query = query.Where("job.exclusive = ?", *filter.Exclusive)
}
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
}
if filter.NumAccelerators != nil {
query = buildIntCondition("job.num_acc", filter.NumAccelerators, query)
}
if filter.NumHWThreads != nil {
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
}
if filter.Node != nil {
query = buildResourceJsonCondition("hostname", filter.Node, query)
}
if filter.Energy != nil {
query = buildFloatCondition("job.energy", filter.Energy, query)
}
if filter.MetricStats != nil {
for _, ms := range filter.MetricStats {
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
}
}
return query
}
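A sketch composing one filter into SQL via the builder above (the filter values are invented):

	name := "alice"
	filter := &model.JobFilter{User: &model.StringInput{Eq: &name}}
	query := BuildWhereClause(filter, sq.Select(jobColumns...).From("job"))
	sqlStr, args, _ := query.ToSql()
	// sqlStr: "SELECT ... FROM job WHERE job.hpc_user = ?", args: ["alice"]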
func buildIntCondition(field string, cond *schema.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
if cond.From != nil && cond.To != nil {
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
} else if cond.From != nil {
return query.Where("? <= "+field, cond.From.Unix())
} else if cond.To != nil {
return query.Where(field+" <= ?", cond.To.Unix())
} else if cond.Range != "" {
now := time.Now().Unix()
var then int64
switch cond.Range {
case "last6h":
then = now - (60 * 60 * 6)
case "last24h":
then = now - (60 * 60 * 24)
case "last7d":
then = now - (60 * 60 * 24 * 7)
case "last30d":
then = now - (60 * 60 * 24 * 30)
default:
log.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
return query
}
return query.Where(field+" BETWEEN ? AND ?", then, now)
} else {
return query
}
}
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(footprint)")
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
}
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
if cond.Eq != nil {
return query.Where(field+" = ?", *cond.Eq)
}
if cond.Neq != nil {
return query.Where(field+" != ?", *cond.Neq)
}
if cond.StartsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
}
if cond.In != nil {
queryElements := make([]string, len(cond.In))
copy(queryElements, cond.In)
return query.Where(sq.Or{sq.Eq{field: queryElements}})
}
return query
}
func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(meta_data)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") = ?", *cond.Eq)
}
if cond.Neq != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") != ?", *cond.Neq)
}
if cond.StartsWith != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") LIKE ?", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
}
return query
}
func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(resources)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq)
}
if cond.Neq != nil { // Currently Unused
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") != ?)", *cond.Neq)
}
if cond.StartsWith != nil { // Currently Unused
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\")) LIKE ?)", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil { // Currently Unused
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") LIKE ?)", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") LIKE ?)", fmt.Sprint("%", *cond.Contains, "%"))
}
return query
}
var (
matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
)
func toSnakeCase(str string) string {
for _, c := range str {
if c == '\'' || c == '\\' {
log.Panic("toSnakeCase() attack vector!")
}
}
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)
}
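For reference, sample conversions the two regex passes produce (editor's examples):

	// toSnakeCase("startTime") -> "start_time"
	// toSnakeCase("numNodes")  -> "num_nodes"
	// toSnakeCase("JobID")     -> "job_id"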

View File

@ -5,9 +5,11 @@
package repository
import (
"context"
"fmt"
"testing"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
_ "github.com/mattn/go-sqlite3"
)
@ -28,8 +30,10 @@ func TestFind(t *testing.T) {
func TestFindById(t *testing.T) {
r := setup(t)
job, err := r.FindById(getContext(t), 5)
if err != nil {
	t.Fatal(err)
}
// fmt.Printf("%+v", job)
@ -41,8 +45,22 @@ func TestFindById(t *testing.T) {
func TestGetTags(t *testing.T) {
r := setup(t)
const contextUserKey ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
ctx := context.WithValue(getContext(t), contextUserKey, contextUserValue)
// Test Tag has Scope "global"
tags, counts, err := r.CountTags(GetUserFromContext(ctx))
if err != nil {
t.Fatal(err)
}
fmt.Printf("TAGS %+v \n", tags)
// fmt.Printf("COUNTS %+v \n", counts)

View File

@ -16,7 +16,7 @@ import (
"github.com/golang-migrate/migrate/v4/source/iofs"
)
const Version uint = 8
//go:embed migrations/*
var migrationFiles embed.FS
@ -54,7 +54,7 @@ func checkDBVersion(backend string, db *sql.DB) error {
return err
}
default:
log.Fatalf("unsupported database backend: %s", backend)
log.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
}
v, dirty, err := m.Version()
@ -102,7 +102,7 @@ func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err erro
return m, err
}
default:
log.Fatalf("unsupported database backend: %s", backend)
log.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
}
return m, nil
@ -114,6 +114,14 @@ func MigrateDB(backend string, db string) error {
return err
}
v, dirty, err := m.Version()
if v < Version {
	log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version)
}
if dirty {
	return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version)
}
if err := m.Up(); err != nil {
	if err == migrate.ErrNoChange {
		log.Info("DB already up to date!")

View File

@ -0,0 +1,83 @@
ALTER TABLE job DROP energy;
ALTER TABLE job DROP energy_footprint;
ALTER TABLE job ADD COLUMN flops_any_avg;
ALTER TABLE job ADD COLUMN mem_bw_avg;
ALTER TABLE job ADD COLUMN mem_used_max;
ALTER TABLE job ADD COLUMN load_avg;
ALTER TABLE job ADD COLUMN net_bw_avg;
ALTER TABLE job ADD COLUMN net_data_vol_total;
ALTER TABLE job ADD COLUMN file_bw_avg;
ALTER TABLE job ADD COLUMN file_data_vol_total;
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
ALTER TABLE job DROP footprint;
-- Restore the previous (reserved keyword) names
RENAME TABLE hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
DROP INDEX IF EXISTS jobs_cluster;
DROP INDEX IF EXISTS jobs_cluster_user;
DROP INDEX IF EXISTS jobs_cluster_project;
DROP INDEX IF EXISTS jobs_cluster_subcluster;
DROP INDEX IF EXISTS jobs_cluster_starttime;
DROP INDEX IF EXISTS jobs_cluster_duration;
DROP INDEX IF EXISTS jobs_cluster_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_jobstate;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_user;
DROP INDEX IF EXISTS jobs_user_starttime;
DROP INDEX IF EXISTS jobs_user_duration;
DROP INDEX IF EXISTS jobs_user_numnodes;
DROP INDEX IF EXISTS jobs_project;
DROP INDEX IF EXISTS jobs_project_user;
DROP INDEX IF EXISTS jobs_project_starttime;
DROP INDEX IF EXISTS jobs_project_duration;
DROP INDEX IF EXISTS jobs_project_numnodes;
DROP INDEX IF EXISTS jobs_jobstate;
DROP INDEX IF EXISTS jobs_jobstate_user;
DROP INDEX IF EXISTS jobs_jobstate_project;
DROP INDEX IF EXISTS jobs_jobstate_starttime;
DROP INDEX IF EXISTS jobs_jobstate_duration;
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_starttime;
DROP INDEX IF EXISTS jobs_duration;
DROP INDEX IF EXISTS jobs_numnodes;
DROP INDEX IF EXISTS jobs_duration_starttime;
DROP INDEX IF EXISTS jobs_numnodes_starttime;
DROP INDEX IF EXISTS jobs_numacc_starttime;
DROP INDEX IF EXISTS jobs_energy_starttime;

View File

@ -0,0 +1,123 @@
DROP INDEX IF EXISTS job_stats ON job;
DROP INDEX IF EXISTS job_by_user ON job;
DROP INDEX IF EXISTS job_by_starttime ON job;
DROP INDEX IF EXISTS job_by_job_id ON job;
DROP INDEX IF EXISTS job_list ON job;
DROP INDEX IF EXISTS job_list_user ON job;
DROP INDEX IF EXISTS job_list_users ON job;
DROP INDEX IF EXISTS job_list_users_start ON job;
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN energy_footprint JSON;
ALTER TABLE job ADD COLUMN footprint JSON;
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
-- Do not use reserved keywords anymore
RENAME TABLE `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
ALTER TABLE job DROP flops_any_avg;
ALTER TABLE job DROP mem_bw_avg;
ALTER TABLE job DROP mem_used_max;
ALTER TABLE job DROP load_avg;
ALTER TABLE job DROP net_bw_avg;
ALTER TABLE job DROP net_data_vol_total;
ALTER TABLE job DROP file_bw_avg;
ALTER TABLE job DROP file_data_vol_total;
-- Indices for: Single filters, combined filters, sorting, sorting with filters
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Optimize DB index usage

View File

@ -0,0 +1,103 @@
ALTER TABLE job DROP energy;
ALTER TABLE job DROP energy_footprint;
ALTER TABLE job ADD COLUMN flops_any_avg;
ALTER TABLE job ADD COLUMN mem_bw_avg;
ALTER TABLE job ADD COLUMN mem_used_max;
ALTER TABLE job ADD COLUMN load_avg;
ALTER TABLE job ADD COLUMN net_bw_avg;
ALTER TABLE job ADD COLUMN net_data_vol_total;
ALTER TABLE job ADD COLUMN file_bw_avg;
ALTER TABLE job ADD COLUMN file_data_vol_total;
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
ALTER TABLE job DROP footprint;
DROP INDEX IF EXISTS jobs_cluster;
DROP INDEX IF EXISTS jobs_cluster_user;
DROP INDEX IF EXISTS jobs_cluster_project;
DROP INDEX IF EXISTS jobs_cluster_subcluster;
DROP INDEX IF EXISTS jobs_cluster_starttime;
DROP INDEX IF EXISTS jobs_cluster_duration;
DROP INDEX IF EXISTS jobs_cluster_numnodes;
DROP INDEX IF EXISTS jobs_cluster_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_numacc;
DROP INDEX IF EXISTS jobs_cluster_energy;
DROP INDEX IF EXISTS jobs_cluster_partition;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_partition_numacc;
DROP INDEX IF EXISTS jobs_cluster_partition_energy;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numacc;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_energy;
DROP INDEX IF EXISTS jobs_cluster_jobstate;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numacc;
DROP INDEX IF EXISTS jobs_cluster_jobstate_energy;
DROP INDEX IF EXISTS jobs_user;
DROP INDEX IF EXISTS jobs_user_starttime;
DROP INDEX IF EXISTS jobs_user_duration;
DROP INDEX IF EXISTS jobs_user_numnodes;
DROP INDEX IF EXISTS jobs_user_numhwthreads;
DROP INDEX IF EXISTS jobs_user_numacc;
DROP INDEX IF EXISTS jobs_user_energy;
DROP INDEX IF EXISTS jobs_project;
DROP INDEX IF EXISTS jobs_project_user;
DROP INDEX IF EXISTS jobs_project_starttime;
DROP INDEX IF EXISTS jobs_project_duration;
DROP INDEX IF EXISTS jobs_project_numnodes;
DROP INDEX IF EXISTS jobs_project_numhwthreads;
DROP INDEX IF EXISTS jobs_project_numacc;
DROP INDEX IF EXISTS jobs_project_energy;
DROP INDEX IF EXISTS jobs_jobstate;
DROP INDEX IF EXISTS jobs_jobstate_user;
DROP INDEX IF EXISTS jobs_jobstate_project;
DROP INDEX IF EXISTS jobs_jobstate_starttime;
DROP INDEX IF EXISTS jobs_jobstate_duration;
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_jobstate_numhwthreads;
DROP INDEX IF EXISTS jobs_jobstate_numacc;
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_starttime;
DROP INDEX IF EXISTS jobs_duration;
DROP INDEX IF EXISTS jobs_numnodes;
DROP INDEX IF EXISTS jobs_numhwthreads;
DROP INDEX IF EXISTS jobs_numacc;
DROP INDEX IF EXISTS jobs_energy;
DROP INDEX IF EXISTS jobs_duration_starttime;
DROP INDEX IF EXISTS jobs_numnodes_starttime;
DROP INDEX IF EXISTS jobs_numhwthreads_starttime;
DROP INDEX IF EXISTS jobs_numacc_starttime;
DROP INDEX IF EXISTS jobs_energy_starttime;

View File

@ -0,0 +1,142 @@
DROP INDEX IF EXISTS job_stats;
DROP INDEX IF EXISTS job_by_user;
DROP INDEX IF EXISTS job_by_starttime;
DROP INDEX IF EXISTS job_by_job_id;
DROP INDEX IF EXISTS job_list;
DROP INDEX IF EXISTS job_list_user;
DROP INDEX IF EXISTS job_list_users;
DROP INDEX IF EXISTS job_list_users_start;
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL;
ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL;
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
-- Do not use reserved keywords anymore
ALTER TABLE "user" RENAME TO hpc_user;
ALTER TABLE job RENAME COLUMN "user" TO hpc_user;
ALTER TABLE job RENAME COLUMN "partition" TO cluster_partition;
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
ALTER TABLE job DROP flops_any_avg;
ALTER TABLE job DROP mem_bw_avg;
ALTER TABLE job DROP mem_used_max;
ALTER TABLE job DROP load_avg;
ALTER TABLE job DROP net_bw_avg;
ALTER TABLE job DROP net_data_vol_total;
ALTER TABLE job DROP file_bw_avg;
ALTER TABLE job DROP file_data_vol_total;
-- Indices for: Single filters, combined filters, sorting, sorting with filters
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, start_time);
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Optimize DB index usage
PRAGMA optimize;
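-- Illustrative only (not part of the migration): a representative query
-- shape served by the composite indexes above. A filtered, sorted job list
--   SELECT * FROM job
--   WHERE cluster = 'clusterA' AND job_state = 'running'
--   ORDER BY start_time DESC LIMIT 50;
-- ('clusterA' is a hypothetical value) can be answered from
-- jobs_cluster_jobstate_starttime alone, avoiding a separate sort step.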

View File

@ -1,253 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
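// QueryJobs returns the jobs matching the given filters, security-checked
// for the requesting user, with optional ordering and paging. A minimal
// usage sketch (identifiers hypothetical):
//
//	order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
//	page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
//	jobs, err := repo.QueryJobs(ctx, filters, page, order)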
func (r *JobRepository) QueryJobs(
ctx context.Context,
filters []*model.JobFilter,
page *model.PageRequest,
order *model.OrderByInput) ([]*schema.Job, error) {
query, qerr := SecurityCheck(ctx, sq.Select(jobColumns...).From("job"))
if qerr != nil {
return nil, qerr
}
if order != nil {
field := toSnakeCase(order.Field)
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order")
}
}
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
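// Example (illustrative): page = 3 with 50 items per page yields
// OFFSET (3-1)*50 = 100 and LIMIT 50.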
for _, f := range filters {
query = BuildWhereClause(f, query)
}
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows (Jobs)")
return nil, err
}
jobs = append(jobs, job)
}
return jobs, nil
}
func (r *JobRepository) CountJobs(
ctx context.Context,
filters []*model.JobFilter) (int, error) {
query, qerr := SecurityCheck(ctx, sq.Select("count(*)").From("job"))
if qerr != nil {
return 0, qerr
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
var count int
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return 0, err
}
return count, nil
}
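// SecurityCheck restricts a job query by the requesting user's roles:
// admin, support and api users see all jobs; managers see jobs of their
// assigned projects plus their own; plain users (and, for compatibility,
// role-less users) only see their own jobs.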
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
if user == nil {
var qnil sq.SelectBuilder
return qnil, fmt.Errorf("user context is nil")
} else if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleApi}) { // Admin & Co. : All jobs
return query, nil
} else if user.HasRole(schema.RoleManager) { // Manager : Add filter for managed projects' jobs only + personal jobs
if len(user.Projects) != 0 {
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.user": user.Username}}), nil
} else {
log.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
return query.Where("job.user = ?", user.Username), nil
}
} else if user.HasRole(schema.RoleUser) { // User : Only personal jobs
return query.Where("job.user = ?", user.Username), nil
} else {
// Short-term compatibility: return the user query if no roles are set:
return query.Where("job.user = ?", user.Username), nil
// // In the long term: return an error instead of falling back:
// var qnil sq.SelectBuilder
// return qnil, fmt.Errorf("user '%s' with unknown roles [%#v]", user.Username, user.Roles)
}
}
// BuildWhereClause builds a sq.SelectBuilder out of a model.JobFilter.
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.User != nil {
query = buildStringCondition("job.user", filter.User, query)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.JobName != nil {
query = buildStringCondition("job.meta_data", filter.JobName, query)
}
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.Partition != nil {
query = buildStringCondition("job.partition", filter.Partition, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current Unix timestamp across different DBs.
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current Unix timestamp across different DBs.
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
}
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
}
if filter.NumAccelerators != nil {
query = buildIntCondition("job.num_acc", filter.NumAccelerators, query)
}
if filter.NumHWThreads != nil {
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
}
if filter.Node != nil {
query = buildStringCondition("job.resources", filter.Node, query)
}
if filter.FlopsAnyAvg != nil {
query = buildFloatCondition("job.flops_any_avg", filter.FlopsAnyAvg, query)
}
if filter.MemBwAvg != nil {
query = buildFloatCondition("job.mem_bw_avg", filter.MemBwAvg, query)
}
if filter.LoadAvg != nil {
query = buildFloatCondition("job.load_avg", filter.LoadAvg, query)
}
if filter.MemUsedMax != nil {
query = buildFloatCondition("job.mem_used_max", filter.MemUsedMax, query)
}
return query
}
func buildIntCondition(field string, cond *schema.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
if cond.From != nil && cond.To != nil {
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
} else if cond.From != nil {
return query.Where("? <= "+field, cond.From.Unix())
} else if cond.To != nil {
return query.Where(field+" <= ?", cond.To.Unix())
} else {
return query
}
}
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
if cond.Eq != nil {
return query.Where(field+" = ?", *cond.Eq)
}
if cond.Neq != nil {
return query.Where(field+" != ?", *cond.Neq)
}
if cond.StartsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
}
if cond.In != nil {
queryElements := make([]string, len(cond.In))
for i, val := range cond.In {
queryElements[i] = val
}
return query.Where(sq.Or{sq.Eq{field: queryElements}})
}
return query
}
var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
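// toSnakeCase maps GraphQL-style field names to SQL column names,
// e.g. "startTime" -> "start_time" and "numNodes" -> "num_nodes".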
func toSnakeCase(str string) string {
for _, c := range str {
if c == '\'' || c == '\\' {
log.Panic("toSnakeCase() attack vector!")
}
}
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)
}

View File

@ -55,7 +55,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, err := db.FindById(jobId)
_, err := db.FindById(getContext(b), jobId)
noErr(b, err)
}
})
@ -111,7 +111,7 @@ func BenchmarkDB_QueryJobs(b *testing.B) {
user := "mppi133h"
filter.User = &model.StringInput{Eq: &user}
page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
b.Run("QueryJobs", func(b *testing.B) {
db := setup(b)

View File

@ -8,12 +8,11 @@ import (
"context"
"database/sql"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
@ -22,7 +21,7 @@ import (
// GraphQL validation should make sure that no unknown values can be specified.
var groupBy2column = map[model.Aggregate]string{
model.AggregateUser: "job.user",
model.AggregateUser: "job.hpc_user",
model.AggregateProject: "job.project",
model.AggregateCluster: "job.cluster",
}
@ -41,8 +40,8 @@ var sortBy2column = map[model.SortByAggregate]string{
func (r *JobRepository) buildCountQuery(
filter []*model.JobFilter,
kind string,
col string) sq.SelectBuilder {
col string,
) sq.SelectBuilder {
var query sq.SelectBuilder
if col != "" {
@ -69,16 +68,16 @@ func (r *JobRepository) buildCountQuery(
func (r *JobRepository) buildStatsQuery(
filter []*model.JobFilter,
col string) sq.SelectBuilder {
col string,
) sq.SelectBuilder {
var query sq.SelectBuilder
castType := r.getCastType()
// fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
if col != "" {
// Scan columns: id, totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
query = sq.Select(col, "COUNT(job.id) as totalJobs",
// Scan columns: id, totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
query = sq.Select(col, "COUNT(job.id) as totalJobs", "name",
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s) as totalWalltime`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s) as totalNodes`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s) as totalNodeHours`, time.Now().Unix(), castType),
@ -86,10 +85,9 @@ func (r *JobRepository) buildStatsQuery(
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s) as totalCoreHours`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_acc) as %s) as totalAccs`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s) as totalAccHours`, time.Now().Unix(), castType),
).From("job").GroupBy(col)
).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col)
} else {
// Scan columns: totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
// Scan columns: totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
query = sq.Select("COUNT(job.id)",
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s)`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s)`, castType),
@ -108,15 +106,15 @@ func (r *JobRepository) buildStatsQuery(
return query
}
func (r *JobRepository) getUserName(ctx context.Context, id string) string {
user := GetUserFromContext(ctx)
name, _ := r.FindColumnValue(user, id, "user", "name", "username", false)
if name != "" {
return name
} else {
return "-"
}
}
// func (r *JobRepository) getUserName(ctx context.Context, id string) string {
// user := GetUserFromContext(ctx)
// name, _ := r.FindColumnValue(user, id, "hpc_user", "name", "username", false)
// if name != "" {
// return name
// } else {
// return "-"
// }
// }
func (r *JobRepository) getCastType() string {
var castType string
@ -138,8 +136,8 @@ func (r *JobRepository) JobsStatsGrouped(
filter []*model.JobFilter,
page *model.PageRequest,
sortBy *model.SortByAggregate,
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
groupBy *model.Aggregate,
) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
query := r.buildStatsQuery(filter, col)
@ -168,14 +166,20 @@ func (r *JobRepository) JobsStatsGrouped(
for rows.Next() {
var id sql.NullString
var name sql.NullString
var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
if err := rows.Scan(&id, &jobs, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
if err := rows.Scan(&id, &jobs, &name, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
var totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int
var personName string
if name.Valid {
personName = name.String
}
if jobs.Valid {
totalJobs = int(jobs.Int64)
@ -205,12 +209,12 @@ func (r *JobRepository) JobsStatsGrouped(
totalAccHours = int(accHours.Int64)
}
if col == "job.user" {
name := r.getUserName(ctx, id.String)
if col == "job.hpc_user" {
// name := r.getUserName(ctx, id.String)
stats = append(stats,
&model.JobsStatistics{
ID: id.String,
Name: name,
Name: personName,
TotalJobs: totalJobs,
TotalWalltime: totalWalltime,
TotalNodes: totalNodes,
@ -218,7 +222,8 @@ func (r *JobRepository) JobsStatsGrouped(
TotalCores: totalCores,
TotalCoreHours: totalCoreHours,
TotalAccs: totalAccs,
TotalAccHours: totalAccHours})
TotalAccHours: totalAccHours,
})
} else {
stats = append(stats,
&model.JobsStatistics{
@ -230,7 +235,8 @@ func (r *JobRepository) JobsStatsGrouped(
TotalCores: totalCores,
TotalCoreHours: totalCoreHours,
TotalAccs: totalAccs,
TotalAccHours: totalAccHours})
TotalAccHours: totalAccHours,
})
}
}
}
@ -241,8 +247,8 @@ func (r *JobRepository) JobsStatsGrouped(
func (r *JobRepository) JobsStats(
ctx context.Context,
filter []*model.JobFilter) ([]*model.JobsStatistics, error) {
filter []*model.JobFilter,
) ([]*model.JobsStatistics, error) {
start := time.Now()
query := r.buildStatsQuery(filter, "")
query, err := SecurityCheck(ctx, query)
@ -277,18 +283,36 @@ func (r *JobRepository) JobsStats(
TotalWalltime: int(walltime.Int64),
TotalNodeHours: totalNodeHours,
TotalCoreHours: totalCoreHours,
TotalAccHours: totalAccHours})
TotalAccHours: totalAccHours,
})
}
log.Debugf("Timer JobStats %s", time.Since(start))
return stats, nil
}
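// LoadJobStat returns the requested footprint statistic (avg, max or min)
// for a metric from the job's archived statistics, or 0.0 if the metric or
// stat type is not available.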
func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 {
if stats, ok := job.Statistics[metric]; ok {
switch statType {
case "avg":
return stats.Avg
case "max":
return stats.Max
case "min":
return stats.Min
default:
log.Errorf("Unknown stat type %s", statType)
}
}
return 0.0
}
func (r *JobRepository) JobCountGrouped(
ctx context.Context,
filter []*model.JobFilter,
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
groupBy *model.Aggregate,
) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
query := r.buildCountQuery(filter, "", col)
@ -315,7 +339,8 @@ func (r *JobRepository) JobCountGrouped(
stats = append(stats,
&model.JobsStatistics{
ID: id.String,
TotalJobs: int(cnt.Int64)})
TotalJobs: int(cnt.Int64),
})
}
}
@ -328,8 +353,8 @@ func (r *JobRepository) AddJobCountGrouped(
filter []*model.JobFilter,
groupBy *model.Aggregate,
stats []*model.JobsStatistics,
kind string) ([]*model.JobsStatistics, error) {
kind string,
) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
query := r.buildCountQuery(filter, kind, col)
@ -376,8 +401,8 @@ func (r *JobRepository) AddJobCount(
ctx context.Context,
filter []*model.JobFilter,
stats []*model.JobsStatistics,
kind string) ([]*model.JobsStatistics, error) {
kind string,
) ([]*model.JobsStatistics, error) {
start := time.Now()
query := r.buildCountQuery(filter, kind, "")
query, err := SecurityCheck(ctx, query)
@ -420,15 +445,41 @@ func (r *JobRepository) AddJobCount(
func (r *JobRepository) AddHistograms(
ctx context.Context,
filter []*model.JobFilter,
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
stat *model.JobsStatistics,
durationBins *string,
) (*model.JobsStatistics, error) {
start := time.Now()
var targetBinCount int
var targetBinSize int
switch {
case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes
targetBinCount = 60
targetBinSize = 60
case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours
targetBinCount = 72
targetBinSize = 600
case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours
targetBinCount = 48
targetBinSize = 3600
case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days
targetBinCount = 12
targetBinSize = 21600
case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days
targetBinCount = 14
targetBinSize = 43200
default: // Fallback: 1 Hour Bins + Max 24 Bins -> Max 24 Hours
targetBinCount = 24
targetBinSize = 3600
}
castType := r.getCastType()
var err error
value := fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
stat.HistDuration, err = r.jobsStatisticsHistogram(ctx, value, filter)
// Return X-Values always as seconds, will be formatted into minutes and hours in frontend
value := fmt.Sprintf(`CAST(ROUND(((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / %d) + 1) as %s) as value`, time.Now().Unix(), targetBinSize, castType)
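// Illustrative (assuming SQLite integer division): with "10m" bins a job of
// 1500 s maps to (1500/600)+1 = 3, i.e. the pre-filled histogram point at
// 3*600 = 1800 s.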
stat.HistDuration, err = r.jobsDurationStatisticsHistogram(ctx, value, filter, targetBinSize, &targetBinCount)
if err != nil {
log.Warn("Error while loading job statistics histogram: running jobs")
log.Warn("Error while loading job statistics histogram: job duration")
return nil, err
}
@ -459,14 +510,16 @@ func (r *JobRepository) AddMetricHistograms(
ctx context.Context,
filter []*model.JobFilter,
metrics []string,
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
stat *model.JobsStatistics,
targetBinCount *int,
) (*model.JobsStatistics, error) {
start := time.Now()
// Running Jobs Only: First query jobdata from sqlite, then query data and make bins
for _, f := range filter {
if f.State != nil {
if len(f.State) == 1 && f.State[0] == "running" {
stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter)
stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter, targetBinCount)
log.Debugf("Timer AddMetricHistograms %s", time.Since(start))
return stat, nil
}
@ -475,7 +528,7 @@ func (r *JobRepository) AddMetricHistograms(
// All other cases: Query and make bins in sqlite directly
for _, m := range metrics {
metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter)
metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter, targetBinCount)
if err != nil {
log.Warnf("Error while loading job metric statistics histogram: %s", m)
continue
@ -491,8 +544,8 @@ func (r *JobRepository) AddMetricHistograms(
func (r *JobRepository) jobsStatisticsHistogram(
ctx context.Context,
value string,
filters []*model.JobFilter) ([]*model.HistoPoint, error) {
filters []*model.JobFilter,
) ([]*model.HistoPoint, error) {
start := time.Now()
query, qerr := SecurityCheck(ctx,
sq.Select(value, "COUNT(job.id) AS count").From("job"))
@ -512,6 +565,7 @@ func (r *JobRepository) jobsStatisticsHistogram(
}
points := make([]*model.HistoPoint, 0)
// Is it possible to introduce zero values here? Requires info about bin count.
for rows.Next() {
point := model.HistoPoint{}
if err := rows.Scan(&point.Value, &point.Count); err != nil {
@ -525,39 +579,79 @@ func (r *JobRepository) jobsStatisticsHistogram(
return points, nil
}
func (r *JobRepository) jobsDurationStatisticsHistogram(
ctx context.Context,
value string,
filters []*model.JobFilter,
binSizeSeconds int,
targetBinCount *int,
) ([]*model.HistoPoint, error) {
start := time.Now()
query, qerr := SecurityCheck(ctx,
sq.Select(value, "COUNT(job.id) AS count").From("job"))
if qerr != nil {
return nil, qerr
}
// Setup Array
points := make([]*model.HistoPoint, 0)
for i := 1; i <= *targetBinCount; i++ {
point := model.HistoPoint{Value: i * binSizeSeconds, Count: 0}
points = append(points, &point)
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
rows, err := query.GroupBy("value").RunWith(r.DB).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
// Fill Array at matching $Value
for rows.Next() {
point := model.HistoPoint{}
if err := rows.Scan(&point.Value, &point.Count); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
for _, e := range points {
if e.Value == (point.Value * binSizeSeconds) {
// Note:
// Matching on the unmodified integer value (and multiplying point.Value by
// binSizeSeconds only after the match) makes the frontend loop up to the
// highest targetBinCount, because the zoom condition is instantly
// fulfilled (root cause unknown); hence the multiplication inside the
// comparison above.
e.Count = point.Count
break
}
}
}
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
return points, nil
}
func (r *JobRepository) jobsMetricStatisticsHistogram(
ctx context.Context,
metric string,
filters []*model.JobFilter) (*model.MetricHistoPoints, error) {
var dbMetric string
switch metric {
case "cpu_load":
dbMetric = "load_avg"
case "flops_any":
dbMetric = "flops_any_avg"
case "mem_bw":
dbMetric = "mem_bw_avg"
case "mem_used":
dbMetric = "mem_used_max"
case "net_bw":
dbMetric = "net_bw_avg"
case "file_bw":
dbMetric = "file_bw_avg"
default:
return nil, fmt.Errorf("%s not implemented", metric)
}
filters []*model.JobFilter,
bins *int,
) (*model.MetricHistoPoints, error) {
// Get specific Peak or largest Peak
var metricConfig *schema.MetricConfig
var peak float64 = 0.0
var unit string = ""
var peak float64
var unit string
var footprintStat string
for _, f := range filters {
if f.Cluster != nil {
metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
peak = metricConfig.Peak
unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
footprintStat = metricConfig.Footprint
log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
}
}
@ -572,58 +666,40 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
if unit == "" {
unit = m.Unit.Prefix + m.Unit.Base
}
if footprintStat == "" {
footprintStat = m.Footprint
}
}
}
}
}
// log.Debugf("Metric %s: DB %s, Peak %f, Unit %s", metric, dbMetric, peak, unit)
// Make bins, see https://jereze.com/code/sql-histogram/
// log.Debugf("Metric %s, Peak %f, Unit %s", metric, peak, unit)
// Make bins, see https://jereze.com/code/sql-histogram/ (Modified here)
start := time.Now()
crossJoinQuery := sq.Select(
fmt.Sprintf(`max(%s) as max`, dbMetric),
fmt.Sprintf(`min(%s) as min`, dbMetric),
).From("job").Where(
fmt.Sprintf(`%s is not null`, dbMetric),
).Where(
fmt.Sprintf(`%s <= %f`, dbMetric, peak),
)
crossJoinQuery, cjqerr := SecurityCheck(ctx, crossJoinQuery)
if cjqerr != nil {
return nil, cjqerr
}
for _, f := range filters {
crossJoinQuery = BuildWhereClause(f, crossJoinQuery)
}
crossJoinQuerySql, crossJoinQueryArgs, sqlerr := crossJoinQuery.ToSql()
if sqlerr != nil {
return nil, sqlerr
}
bins := 10
binQuery := fmt.Sprintf(`CAST( (case when job.%s = value.max then value.max*0.999999999 else job.%s end - value.min) / (value.max - value.min) * %d as INTEGER )`, dbMetric, dbMetric, bins)
// Find Jobs' Value Bin Number: Divide Value by Peak, Multiply by RequestedBins, then CAST to INT: Gets Bin-Number of Job
binQuery := fmt.Sprintf(`CAST(
((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f)
* %v as INTEGER )`,
(metric + "_" + footprintStat), peak, peak, (metric + "_" + footprintStat), peak, *bins)
mainQuery := sq.Select(
fmt.Sprintf(`%s + 1 as bin`, binQuery),
fmt.Sprintf(`count(job.%s) as count`, dbMetric),
fmt.Sprintf(`CAST(((value.max / %d) * (%s )) as INTEGER ) as min`, bins, binQuery),
fmt.Sprintf(`CAST(((value.max / %d) * (%s + 1 )) as INTEGER ) as max`, bins, binQuery),
).From("job").CrossJoin(
fmt.Sprintf(`(%s) as value`, crossJoinQuerySql), crossJoinQueryArgs...,
).Where(fmt.Sprintf(`job.%s is not null and job.%s <= %f`, dbMetric, dbMetric, peak))
"count(*) as count",
// For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * %s as min`, peak, *bins, binQuery),
// For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * (%s + 1) as max`, peak, *bins, binQuery),
).From("job").Where(
"JSON_VALID(footprint)",
).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak))
// Only accessible Jobs...
mainQuery, qerr := SecurityCheck(ctx, mainQuery)
if qerr != nil {
return nil, qerr
}
// Filters...
for _, f := range filters {
mainQuery = BuildWhereClause(f, mainQuery)
}
@ -637,18 +713,41 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
return nil, err
}
// Setup Return Array With Bin-Numbers for Match and Min/Max based on Peak
points := make([]*model.MetricHistoPoint, 0)
for rows.Next() {
point := model.MetricHistoPoint{}
if err := rows.Scan(&point.Bin, &point.Count, &point.Min, &point.Max); err != nil {
log.Warnf("Error while scanning rows for %s", metric)
return nil, err // Totally bricks cc-backend if returned and if all metrics requested?
}
points = append(points, &point)
binStep := int(peak) / *bins
for i := 1; i <= *bins; i++ {
binMin := (binStep * (i - 1))
binMax := (binStep * i)
epoint := model.MetricHistoPoint{Bin: &i, Count: 0, Min: &binMin, Max: &binMax}
points = append(points, &epoint)
}
result := model.MetricHistoPoints{Metric: metric, Unit: unit, Data: points}
for rows.Next() { // Fill Count if Bin-No. Matches (Not every Bin exists in DB!)
rpoint := model.MetricHistoPoint{}
if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max
log.Warnf("Error while scanning rows for %s", metric)
return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested?
}
for _, e := range points {
if e.Bin != nil && rpoint.Bin != nil {
if *e.Bin == *rpoint.Bin {
e.Count = rpoint.Count
// Only Required For Debug: Check DB returned Min/Max against Backend Init above
// if rpoint.Min != nil {
// log.Warnf(">>>> Bin %d Min Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Min, *e.Min)
// }
// if rpoint.Max != nil {
// log.Warnf(">>>> Bin %d Max Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Max, *e.Max)
// }
break
}
}
}
}
result := model.MetricHistoPoints{Metric: metric, Unit: unit, Stat: &footprintStat, Data: points}
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
return &result, nil
@ -657,7 +756,9 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
func (r *JobRepository) runningJobsMetricStatisticsHistogram(
ctx context.Context,
metrics []string,
filters []*model.JobFilter) []*model.MetricHistoPoints {
filters []*model.JobFilter,
bins *int,
) []*model.MetricHistoPoints {
// Get Jobs
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil)
@ -681,7 +782,7 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
continue
}
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
log.Errorf("Error while loading averages for histogram: %s", err)
return nil
}
@ -692,15 +793,14 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
for idx, metric := range metrics {
// Get specific Peak or largest Peak
var metricConfig *schema.MetricConfig
var peak float64 = 0.0
var unit string = ""
var peak float64
var unit string
for _, f := range filters {
if f.Cluster != nil {
metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
peak = metricConfig.Peak
unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
}
}
@ -720,28 +820,24 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
}
// Make and fill bins
bins := 10.0
peakBin := peak / bins
peakBin := int(peak) / *bins
points := make([]*model.MetricHistoPoint, 0)
for b := 0; b < 10; b++ {
for b := 0; b < *bins; b++ {
count := 0
bindex := b + 1
bmin := math.Round(peakBin * float64(b))
bmax := math.Round(peakBin * (float64(b) + 1.0))
bmin := peakBin * b
bmax := peakBin * (b + 1)
// Iterate AVG values for indexed metric and count for bins
for _, val := range avgs[idx] {
if float64(val) >= bmin && float64(val) < bmax {
if int(val) >= bmin && int(val) < bmax {
count += 1
}
}
bminint := int(bmin)
bmaxint := int(bmax)
// Append Bin to Metric Result Array
point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bminint, Max: &bmaxint}
point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bmin, Max: &bmax}
points = append(points, &point)
}

View File

@ -5,6 +5,7 @@
package repository
import (
"fmt"
"strings"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
@ -14,7 +15,13 @@ import (
)
// AddTag adds the tag with id `tag` to the job with database id `job` on behalf of `user`.
func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*schema.Tag, error) {
j, err := r.FindByIdWithUser(user, job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag)
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
@ -23,49 +30,153 @@ func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
return nil, err
}
j, err := r.FindById(job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
tags, err := r.GetTags(&job)
tags, err := r.GetTags(user, &job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, tags)
archiveTags, err := r.getArchiveTags(&job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, archiveTags)
}
// Removes a tag from a job
func (r *JobRepository) RemoveTag(job, tag int64) ([]*schema.Tag, error) {
// Removes a tag from a job by tag id
func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) {
j, err := r.FindByIdWithUser(user, job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
q := sq.Delete("jobtag").Where("jobtag.job_id = ?", job).Where("jobtag.tag_id = ?", tag)
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error adding tag with %s: %v", s, err)
log.Errorf("Error removing tag with %s: %v", s, err)
return nil, err
}
j, err := r.FindById(job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
tags, err := r.GetTags(&job)
tags, err := r.GetTags(user, &job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, tags)
archiveTags, err := r.getArchiveTags(&job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, archiveTags)
}
// Removes a tag from a job by tag info
func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagType string, tagName string, tagScope string) ([]*schema.Tag, error) {
// Get Tag ID to delete
tagID, exists := r.TagId(tagType, tagName, tagScope)
if !exists {
log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
return nil, fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
}
// Get Job
j, err := r.FindByIdWithUser(user, job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
// Handle Delete
q := sq.Delete("jobtag").Where("jobtag.job_id = ?", job).Where("jobtag.tag_id = ?", tagID)
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error removing tag from table 'jobTag' with %s: %v", s, err)
return nil, err
}
tags, err := r.GetTags(user, &job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
archiveTags, err := r.getArchiveTags(&job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, archiveTags)
}
// Removes a tag from db by tag info
func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagScope string) error {
// Get Tag ID to delete
tagID, exists := r.TagId(tagType, tagName, tagScope)
if !exists {
log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
return fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
}
// Handle Delete JobTagTable
qJobTag := sq.Delete("jobtag").Where("jobtag.tag_id = ?", tagID)
if _, err := qJobTag.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := qJobTag.ToSql()
log.Errorf("Error removing tag from table 'jobTag' with %s: %v", s, err)
return err
}
// Handle Delete TagTable
qTag := sq.Delete("tag").Where("tag.id = ?", tagID)
if _, err := qTag.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := qTag.ToSql()
log.Errorf("Error removing tag from table 'tag' with %s: %v", s, err)
return err
}
return nil
}
// Removes a tag from db by tag id
func (r *JobRepository) RemoveTagById(tagID int64) error {
// Handle Delete JobTagTable
qJobTag := sq.Delete("jobtag").Where("jobtag.tag_id = ?", tagID)
if _, err := qJobTag.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := qJobTag.ToSql()
log.Errorf("Error removing tag from table 'jobTag' with %s: %v", s, err)
return err
}
// Handle Delete TagTable
qTag := sq.Delete("tag").Where("tag.id = ?", tagID)
if _, err := qTag.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := qTag.ToSql()
log.Errorf("Error removing tag from table 'tag' with %s: %v", s, err)
return err
}
return nil
}
// CreateTag creates a new tag with the specified type, name and scope and returns its database id.
func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64, err error) {
q := sq.Insert("tag").Columns("tag_type", "tag_name").Values(tagType, tagName)
func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) {
// Default to "Global" scope if none defined
if tagScope == "" {
tagScope = "global"
}
q := sq.Insert("tag").Columns("tag_type", "tag_name", "tag_scope").Values(tagType, tagName, tagScope)
res, err := q.RunWith(r.stmtCache).Exec()
if err != nil {
@ -78,8 +189,9 @@ func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64,
}
func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts map[string]int, err error) {
// Fetch all Tags in DB for Display in Frontend Tag-View
tags = make([]schema.Tag, 0, 100)
xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name FROM tag")
xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name, tag_scope FROM tag")
if err != nil {
return nil, nil, err
}
@ -89,22 +201,42 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts
if err = xrows.StructScan(&t); err != nil {
return nil, nil, err
}
tags = append(tags, t)
// Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags
readable, err := r.checkScopeAuth(user, "read", t.Scope)
if err != nil {
return nil, nil, err
}
if readable {
tags = append(tags, t)
}
}
q := sq.Select("t.tag_name, count(jt.tag_id)").
// Query and Count Jobs with attached Tags
q := sq.Select("t.tag_name, t.id, count(jt.tag_id)").
From("tag t").
LeftJoin("jobtag jt ON t.id = jt.tag_id").
GroupBy("t.tag_name")
// Handle Scope Filtering
scopeList := "\"global\""
if user != nil {
scopeList += ",\"" + user.Username + "\""
}
if user != nil && user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
scopeList += ",\"admin\""
}
q = q.Where("t.tag_scope IN (" + scopeList + ")")
// Handle Job Ownership
if user != nil && user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { // ADMIN || SUPPORT: Count all jobs
log.Debug("CountTags: User Admin or Support -> Count all Jobs for Tags")
// log.Debug("CountTags: User Admin or Support -> Count all Jobs for Tags")
// Unchanged: Must remain its own case due to UserRole/NoRole compatibility handling in the else case
} else if user != nil && user.HasRole(schema.RoleManager) { // MANAGER: Count own jobs plus project's jobs
// Build ("project1", "project2", ...) list of variable length directly in SQL string
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ? OR job.project IN (\""+strings.Join(user.Projects, "\",\"")+"\"))", user.Username)
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.hpc_user = ? OR job.project IN (\""+strings.Join(user.Projects, "\",\"")+"\"))", user.Username)
} else if user != nil { // USER OR NO ROLE (Compatibility): Only count own jobs
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ?)", user.Username)
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.hpc_user = ?)", user.Username)
}
rows, err := q.RunWith(r.stmtCache).Query()
@ -115,29 +247,44 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts
counts = make(map[string]int)
for rows.Next() {
var tagName string
var tagId int
var count int
if err = rows.Scan(&tagName, &count); err != nil {
if err = rows.Scan(&tagName, &tagId, &count); err != nil {
return nil, nil, err
}
counts[tagName] = count
// Use tagId as second Map-Key component to differentiate tags with identical names
counts[fmt.Sprint(tagName, tagId)] = count
}
err = rows.Err()
return
return tags, counts, err
}
// AddTagOrCreate adds the tag with the specified type, name and scope to the job with the database id `jobId`.
// If such a tag does not yet exist, it is created.
func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName string) (tagId int64, err error) {
tagId, exists := r.TagId(tagType, tagName)
func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType string, tagName string, tagScope string) (tagId int64, err error) {
// Default to "Global" scope if none defined
if tagScope == "" {
tagScope = "global"
}
writable, err := r.checkScopeAuth(user, "write", tagScope)
if err != nil {
return 0, err
}
if !writable {
return 0, fmt.Errorf("cannot write tag scope with current authorization")
}
tagId, exists := r.TagId(tagType, tagName, tagScope)
if !exists {
tagId, err = r.CreateTag(tagType, tagName)
tagId, err = r.CreateTag(tagType, tagName, tagScope)
if err != nil {
return 0, err
}
}
if _, err := r.AddTag(jobId, tagId); err != nil {
if _, err := r.AddTag(user, jobId, tagId); err != nil {
return 0, err
}
@ -158,19 +305,29 @@ func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool
}
// TagId returns the database id of the tag with the specified type, name and scope.
func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) {
func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) {
exists = true
if err := sq.Select("id").From("tag").
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).Where("tag.tag_scope = ?", tagScope).
RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil {
exists = false
}
return
}
// GetTags returns a list of all tags if job is nil or of the tags that the job with that database ID has.
func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name").From("tag")
// TagInfo returns the database info of the tag with the specified id.
func (r *JobRepository) TagInfo(tagId int64) (tagType string, tagName string, tagScope string, exists bool) {
exists = true
if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagId).
RunWith(r.stmtCache).QueryRow().Scan(&tagType, &tagName, &tagScope); err != nil {
exists = false
}
return
}
// GetTags returns a list of all scoped tags if job is nil or of the tags that the job with that database ID has.
func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")
if job != nil {
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
}
@ -185,7 +342,41 @@ func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
tags := make([]*schema.Tag, 0)
for rows.Next() {
tag := &schema.Tag{}
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name); err != nil {
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
// Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags
readable, err := r.checkScopeAuth(user, "read", tag.Scope)
if err != nil {
return nil, err
}
if readable {
tags = append(tags, tag)
}
}
return tags, nil
}
// getArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil, or of the tags that the job with that database ID has.
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")
if job != nil {
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error get tags with %s: %v", s, err)
return nil, err
}
tags := make([]*schema.Tag, 0)
for rows.Next() {
tag := &schema.Tag{}
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
@ -194,3 +385,59 @@ func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
return tags, nil
}
func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, tagScope string) (err error) {
// ImportTag performs no scope check; it only imports tags from the meta file into the DB (no recursive archive update required) and returns only an error value.
tagId, exists := r.TagId(tagType, tagName, tagScope)
if !exists {
tagId, err = r.CreateTag(tagType, tagName, tagScope)
if err != nil {
return err
}
}
q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId)
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error adding tag on import with %s: %v", s, err)
return err
}
return nil
}
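// checkScopeAuth decides whether a user may read or write a tag scope:
// writing "admin" requires the admin role (or an api-only user), writing
// "global" requires admin or support (or api-only), and a scope equal to
// the username (private tag) is always readable and writable by its owner;
// "global" tags are readable by everyone, "admin" tags only by admin/support.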
func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scope string) (pass bool, err error) {
if user != nil {
switch {
case operation == "write" && scope == "admin":
if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) {
return true, nil
}
return false, nil
case operation == "write" && scope == "global":
if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) {
return true, nil
}
return false, nil
case operation == "write" && scope == user.Username:
return true, nil
case operation == "read" && scope == "admin":
return user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}), nil
case operation == "read" && scope == "global":
return true, nil
case operation == "read" && scope == user.Username:
return true, nil
default:
if operation == "read" || operation == "write" {
// No acceptable scope: deny tag
return false, nil
} else {
return false, fmt.Errorf("error while checking tag operation auth: unknown operation (%s)", operation)
}
}
} else {
return false, fmt.Errorf("error while checking tag operation auth: no user in context")
}
}

Binary file not shown.

Binary file not shown.

View File

@ -6,7 +6,6 @@ package repository
import (
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/jmoiron/sqlx"
)
@ -18,20 +17,12 @@ type Transaction struct {
func (r *JobRepository) TransactionInit() (*Transaction, error) {
var err error
t := new(Transaction)
// Inserts are bundled into transactions because in sqlite,
// that speeds up inserts A LOT.
t.tx, err = r.DB.Beginx()
if err != nil {
log.Warn("Error while bundling transactions")
return nil, err
}
t.stmt, err = t.tx.PrepareNamed(NamedJobInsert)
if err != nil {
log.Warn("Error while preparing namedJobInsert")
return nil, err
}
return t, nil
}
@ -50,7 +41,6 @@ func (r *JobRepository) TransactionCommit(t *Transaction) error {
return err
}
t.stmt = t.tx.NamedStmt(t.stmt)
return nil
}
@ -59,14 +49,17 @@ func (r *JobRepository) TransactionEnd(t *Transaction) error {
log.Warn("Error while committing SQL transactions")
return err
}
return nil
}
func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, error) {
res, err := t.stmt.Exec(job)
func (r *JobRepository) TransactionAddNamed(
t *Transaction,
query string,
args ...interface{},
) (int64, error) {
res, err := t.tx.NamedExec(query, args)
if err != nil {
log.Errorf("repository initDB(): %v", err)
log.Errorf("Named Exec failed: %v", err)
return 0, err
}
@ -79,26 +72,19 @@ func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, e
return id, nil
}
func (r *JobRepository) TransactionAddTag(t *Transaction, tag *schema.Tag) (int64, error) {
res, err := t.tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type)
func (r *JobRepository) TransactionAdd(t *Transaction, query string, args ...interface{}) (int64, error) {
res, err := t.tx.Exec(query, args...)
if err != nil {
log.Errorf("Error while inserting tag into tag table: %v (Type %v)", tag.Name, tag.Type)
return 0, err
}
tagId, err := res.LastInsertId()
if err != nil {
log.Warn("Error while getting last insert ID")
log.Errorf("TransactionAdd(), Exec() Error: %v", err)
return 0, err
}
return tagId, nil
}
func (r *JobRepository) TransactionSetTag(t *Transaction, jobId int64, tagId int64) error {
if _, err := t.tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, jobId, tagId); err != nil {
log.Errorf("Error while inserting jobtag into jobtag table: %v (TagID %v)", jobId, tagId)
return err
id, err := res.LastInsertId()
if err != nil {
log.Errorf("TransactionAdd(), LastInsertId() Error: %v", err)
return 0, err
}
return nil
return id, nil
}

View File

@ -19,6 +19,7 @@ import (
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
"golang.org/x/crypto/bcrypt"
"github.com/ClusterCockpit/cc-backend/internal/config"
)
var (
@ -46,8 +47,8 @@ func GetUserRepository() *UserRepository {
func (r *UserRepository) GetUser(username string) (*schema.User, error) {
user := &schema.User{Username: username}
var hashedPassword, name, rawRoles, email, rawProjects sql.NullString
if err := sq.Select("password", "ldap", "name", "roles", "email", "projects").From("user").
Where("user.username = ?", username).RunWith(r.DB).
if err := sq.Select("password", "ldap", "name", "roles", "email", "projects").From("hpc_user").
Where("hpc_user.username = ?", username).RunWith(r.DB).
QueryRow().Scan(&hashedPassword, &user.AuthSource, &name, &rawRoles, &email, &rawProjects); err != nil {
log.Warnf("Error while querying user '%v' from database", username)
return nil, err
@ -72,9 +73,8 @@ func (r *UserRepository) GetUser(username string) (*schema.User, error) {
}
func (r *UserRepository) GetLdapUsernames() ([]string, error) {
var users []string
rows, err := r.DB.Query(`SELECT username FROM user WHERE user.ldap = 1`)
rows, err := r.DB.Query(`SELECT username FROM hpc_user WHERE hpc_user.ldap = 1`)
if err != nil {
log.Warn("Error while querying usernames")
return nil, err
@ -122,18 +122,62 @@ func (r *UserRepository) AddUser(user *schema.User) error {
vals = append(vals, int(user.AuthSource))
}
if _, err := sq.Insert("user").Columns(cols...).Values(vals...).RunWith(r.DB).Exec(); err != nil {
if _, err := sq.Insert("hpc_user").Columns(cols...).Values(vals...).RunWith(r.DB).Exec(); err != nil {
log.Errorf("Error while inserting new user '%v' into DB", user.Username)
return err
}
log.Infof("new user %#v created (roles: %s, auth-source: %d, projects: %s)", user.Username, rolesJson, user.AuthSource, projectsJson)
defaultMetricsCfg, err := config.LoadDefaultMetricsConfig()
if err != nil {
log.Errorf("Error loading default metrics config: %v", err)
} else if defaultMetricsCfg != nil {
for _, cluster := range defaultMetricsCfg.Clusters {
metricsArray := config.ParseMetricsString(cluster.DefaultMetrics)
metricsJSON, err := json.Marshal(metricsArray)
if err != nil {
log.Errorf("Error marshaling default metrics for cluster %s: %v", cluster.Name, err)
continue
}
confKey := "job_view_selectedMetrics:" + cluster.Name
if _, err := sq.Insert("configuration").
Columns("username", "confkey", "value").
Values(user.Username, confKey, string(metricsJSON)).
RunWith(r.DB).Exec(); err != nil {
log.Errorf("Error inserting default job view metrics for user %s and cluster %s: %v", user.Username, cluster.Name, err)
} else {
log.Infof("Default job view metrics for user %s and cluster %s set to %s", user.Username, cluster.Name, string(metricsJSON))
}
}
}
return nil
}
func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) error {
// user contains updated info, apply to dbuser
// TODO: Discuss updatable fields
if dbUser.Name != user.Name {
if _, err := sq.Update("hpc_user").Set("name", user.Name).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
log.Errorf("error while updating name of user '%s'", user.Username)
return err
}
}
// Toggled until greenlit
// if dbUser.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) {
// projects, _ := json.Marshal(user.Projects)
// if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
// return err
// }
// }
return nil
}
func (r *UserRepository) DelUser(username string) error {
_, err := r.DB.Exec(`DELETE FROM user WHERE user.username = ?`, username)
_, err := r.DB.Exec(`DELETE FROM hpc_user WHERE hpc_user.username = ?`, username)
if err != nil {
log.Errorf("Error while deleting user '%s' from DB", username)
return err
@ -143,8 +187,7 @@ func (r *UserRepository) DelUser(username string) error {
}
func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) {
q := sq.Select("username", "name", "email", "roles", "projects").From("user")
q := sq.Select("username", "name", "email", "roles", "projects").From("hpc_user")
if specialsOnly {
q = q.Where("(roles != '[\"user\"]' AND roles != '[]')")
}
@ -186,8 +229,8 @@ func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) {
func (r *UserRepository) AddRole(
ctx context.Context,
username string,
queryrole string) error {
queryrole string,
) error {
newRole := strings.ToLower(queryrole)
user, err := r.GetUser(username)
if err != nil {
@ -198,15 +241,15 @@ func (r *UserRepository) AddRole(
exists, valid := user.HasValidRole(newRole)
if !valid {
return fmt.Errorf("Supplied role is no valid option : %v", newRole)
return fmt.Errorf("supplied role is no valid option : %v", newRole)
}
if exists {
return fmt.Errorf("User %v already has role %v", username, newRole)
return fmt.Errorf("user %v already has role %v", username, newRole)
}
roles, _ := json.Marshal(append(user.Roles, newRole))
if _, err := sq.Update("user").Set("roles", roles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
log.Errorf("Error while adding new role for user '%s'", user.Username)
if _, err := sq.Update("hpc_user").Set("roles", roles).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
log.Errorf("error while adding new role for user '%s'", user.Username)
return err
}
return nil
@ -223,14 +266,14 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
exists, valid := user.HasValidRole(oldRole)
if !valid {
return fmt.Errorf("Supplied role is no valid option : %v", oldRole)
return fmt.Errorf("supplied role is no valid option : %v", oldRole)
}
if !exists {
return fmt.Errorf("Role already deleted for user '%v': %v", username, oldRole)
return fmt.Errorf("role already deleted for user '%v': %v", username, oldRole)
}
if oldRole == schema.GetRoleString(schema.RoleManager) && len(user.Projects) != 0 {
return fmt.Errorf("Cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects)
return fmt.Errorf("cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects)
}
var newroles []string
@ -240,8 +283,8 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
}
}
var mroles, _ = json.Marshal(newroles)
if _, err := sq.Update("user").Set("roles", mroles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
mroles, _ := json.Marshal(newroles)
if _, err := sq.Update("hpc_user").Set("roles", mroles).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
log.Errorf("Error while removing role for user '%s'", user.Username)
return err
}
@ -251,15 +294,15 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr
func (r *UserRepository) AddProject(
ctx context.Context,
username string,
project string) error {
project string,
) error {
user, err := r.GetUser(username)
if err != nil {
return err
}
if !user.HasRole(schema.RoleManager) {
return fmt.Errorf("user '%s' is not a manager!", username)
return fmt.Errorf("user '%s' is not a manager", username)
}
if user.HasProject(project) {
@ -267,7 +310,7 @@ func (r *UserRepository) AddProject(
}
projects, _ := json.Marshal(append(user.Projects, project))
if _, err := sq.Update("user").Set("projects", projects).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
return err
}
@ -281,11 +324,11 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro
}
if !user.HasRole(schema.RoleManager) {
return fmt.Errorf("user '%#v' is not a manager!", username)
return fmt.Errorf("user '%#v' is not a manager", username)
}
if !user.HasProject(project) {
return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match!", username, project)
return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match", username, project)
}
var exists bool
@ -298,14 +341,14 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro
}
}
if exists == true {
if exists {
var result interface{}
if len(newprojects) == 0 {
result = "[]"
} else {
result, _ = json.Marshal(newprojects)
}
if _, err := sq.Update("user").Set("projects", result).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
if _, err := sq.Update("hpc_user").Set("projects", result).Where("hpc_user.username = ?", username).RunWith(r.DB).Exec(); err != nil {
return err
}
return nil
@ -321,9 +364,10 @@ const ContextUserKey ContextKey = "user"
func GetUserFromContext(ctx context.Context) *schema.User {
x := ctx.Value(ContextUserKey)
if x == nil {
log.Warnf("no user retrieved from context")
return nil
}
// log.Infof("user retrieved from context: %v", x.(*schema.User))
return x.(*schema.User)
}
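For orientation, a minimal sketch of the producing side, assuming an authentication middleware that has already resolved a *schema.User (the withUser wrapper is hypothetical, not part of this commit):

	import (
		"context"
		"net/http"

		"github.com/ClusterCockpit/cc-backend/pkg/schema"
	)

	// withUser attaches the authenticated user under ContextUserKey so that
	// downstream handlers can recover it via GetUserFromContext(r.Context()).
	func withUser(next http.Handler, user *schema.User) http.Handler {
		return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
			ctx := context.WithValue(r.Context(), ContextUserKey, user)
			next.ServeHTTP(rw, r.WithContext(ctx))
		})
	}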
@ -336,7 +380,7 @@ func (r *UserRepository) FetchUserInCtx(ctx context.Context, username string) (*
user := &model.User{Username: username}
var name, email sql.NullString
if err := sq.Select("name", "email").From("user").Where("user.username = ?", username).
if err := sq.Select("name", "email").From("hpc_user").Where("hpc_user.username = ?", username).
RunWith(r.DB).QueryRow().Scan(&name, &email); err != nil {
if err == sql.ErrNoRows {
/* This warning will be logged *often* for non-local users, i.e. users mentioned only in job-table or archive, */

View File

@ -24,9 +24,9 @@ var (
type UserCfgRepo struct {
DB *sqlx.DB
Lookup *sqlx.Stmt
lock sync.RWMutex
uiDefaults map[string]interface{}
cache *lrucache.Cache
lock sync.RWMutex
}
func GetUserCfgRepo() *UserCfgRepo {
@ -35,7 +35,7 @@ func GetUserCfgRepo() *UserCfgRepo {
lookupConfigStmt, err := db.DB.Preparex(`SELECT confkey, value FROM configuration WHERE configuration.username = ?`)
if err != nil {
log.Fatalf("db.DB.Preparex() error: %v", err)
log.Fatalf("User Config: Call 'db.DB.Preparex()' failed.\nError: %s\n", err.Error())
}
userCfgRepoInstance = &UserCfgRepo{
@ -112,8 +112,8 @@ func (uCfg *UserCfgRepo) GetUIConfig(user *schema.User) (map[string]interface{},
// configuration.
func (uCfg *UserCfgRepo) UpdateConfig(
key, value string,
user *schema.User) error {
user *schema.User,
) error {
if user == nil {
var val interface{}
if err := json.Unmarshal([]byte(value), &val); err != nil {

View File

@ -25,6 +25,9 @@ func setupUserTest(t *testing.T) *UserCfgRepo {
"jwts": {
"max-age": "2m"
},
"apiAllowedIPs": [
"*"
],
"clusters": [
{
"name": "testcluster",

View File

@ -13,6 +13,7 @@ import (
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
@ -34,32 +35,38 @@ type Route struct {
var routes []Route = []Route{
{"/", "home.tmpl", "ClusterCockpit", false, setupHomeRoute},
{"/config", "config.tmpl", "Settings", false, func(i InfoType, r *http.Request) InfoType { return i }},
{"/config", "config.tmpl", "Settings", false, setupConfigRoute},
{"/monitoring/jobs/", "monitoring/jobs.tmpl", "Jobs - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { return i }},
{"/monitoring/job/{id:[0-9]+}", "monitoring/job.tmpl", "Job <ID> - ClusterCockpit", false, setupJobRoute},
{"/monitoring/users/", "monitoring/list.tmpl", "Users - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "USER"; return i }},
{"/monitoring/projects/", "monitoring/list.tmpl", "Projects - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "PROJECT"; return i }},
{"/monitoring/tags/", "monitoring/taglist.tmpl", "Tags - ClusterCockpit", false, setupTaglistRoute},
{"/monitoring/user/{id}", "monitoring/user.tmpl", "User <ID> - ClusterCockpit", true, setupUserRoute},
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> - ClusterCockpit", false, setupClusterRoute},
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> Node Overview - ClusterCockpit", false, setupClusterOverviewRoute},
{"/monitoring/systems/list/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> Node List - ClusterCockpit", false, setupClusterListRoute},
{"/monitoring/systems/list/{cluster}/{subcluster}", "monitoring/systems.tmpl", "Cluster <ID> <SID> Node List - ClusterCockpit", false, setupClusterListRoute},
{"/monitoring/node/{cluster}/{hostname}", "monitoring/node.tmpl", "Node <ID> - ClusterCockpit", false, setupNodeRoute},
{"/monitoring/analysis/{cluster}", "monitoring/analysis.tmpl", "Analysis - ClusterCockpit", true, setupAnalysisRoute},
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterRoute},
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterStatusRoute},
}
func setupHomeRoute(i InfoType, r *http.Request) InfoType {
jobRepo := repository.GetJobRepository()
groupBy := model.AggregateCluster
// startJobCount := time.Now()
stats, err := jobRepo.JobCountGrouped(r.Context(), nil, &groupBy)
if err != nil {
log.Warnf("failed to count jobs: %s", err.Error())
}
// log.Infof("Timer HOME ROUTE startJobCount: %s", time.Since(startJobCount))
// startRunningJobCount := time.Now()
stats, err = jobRepo.AddJobCountGrouped(r.Context(), nil, &groupBy, stats, "running")
if err != nil {
log.Warnf("failed to count running jobs: %s", err.Error())
}
// log.Infof("Timer HOME ROUTE startRunningJobCount: %s", time.Since(startRunningJobCount))
i["clusters"] = stats
@ -75,8 +82,22 @@ func setupHomeRoute(i InfoType, r *http.Request) InfoType {
return i
}
func setupConfigRoute(i InfoType, r *http.Request) InfoType {
if util.CheckFileExists("./var/notice.txt") {
msg, err := os.ReadFile("./var/notice.txt")
if err == nil {
i["ncontent"] = string(msg)
}
}
return i
}
func setupJobRoute(i InfoType, r *http.Request) InfoType {
i["id"] = mux.Vars(r)["id"]
if config.Keys.EmissionConstant != 0 {
i["emission"] = config.Keys.EmissionConstant
}
return i
}
@ -92,7 +113,7 @@ func setupUserRoute(i InfoType, r *http.Request) InfoType {
return i
}
func setupClusterRoute(i InfoType, r *http.Request) InfoType {
func setupClusterStatusRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["id"] = vars["cluster"]
i["cluster"] = vars["cluster"]
@ -104,6 +125,36 @@ func setupClusterRoute(i InfoType, r *http.Request) InfoType {
return i
}
func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["id"] = vars["cluster"]
i["cluster"] = vars["cluster"]
i["displayType"] = "OVERVIEW"
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
if from != "" || to != "" {
i["from"] = from
i["to"] = to
}
return i
}
func setupClusterListRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["id"] = vars["cluster"]
i["cluster"] = vars["cluster"]
i["sid"] = vars["subcluster"]
i["subCluster"] = vars["subcluster"]
i["displayType"] = "LIST"
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
if from != "" || to != "" {
i["from"] = from
i["to"] = to
}
return i
}
func setupNodeRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["cluster"] = vars["cluster"]
@ -124,28 +175,46 @@ func setupAnalysisRoute(i InfoType, r *http.Request) InfoType {
func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
jobRepo := repository.GetJobRepository()
user := repository.GetUserFromContext(r.Context())
tags, counts, err := jobRepo.CountTags(user)
tags, counts, err := jobRepo.CountTags(repository.GetUserFromContext(r.Context()))
tagMap := make(map[string][]map[string]interface{})
if err != nil {
log.Warnf("GetTags failed: %s", err.Error())
i["tagmap"] = tagMap
return i
}
for _, tag := range tags {
tagItem := map[string]interface{}{
"id": tag.ID,
"name": tag.Name,
"count": counts[tag.Name],
// Reduces displayed tags for unauth'd users
userAuthlevel := repository.GetUserFromContext(r.Context()).GetAuthLevel()
// Uses tag.ID as second Map-Key component to differentiate tags with identical names
if userAuthlevel >= 4 { // Support+ : Show tags for all scopes, regardless of count
for _, tag := range tags {
tagItem := map[string]interface{}{
"id": tag.ID,
"name": tag.Name,
"scope": tag.Scope,
"count": counts[fmt.Sprint(tag.Name, tag.ID)],
}
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
}
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
}
} else if userAuthlevel < 4 && userAuthlevel >= 2 { // User+ : Show global and admin scope only if at least 1 tag used, private scope regardless of count
for _, tag := range tags {
tagCount := counts[fmt.Sprint(tag.Name, tag.ID)]
if ((tag.Scope == "global" || tag.Scope == "admin") && tagCount >= 1) || (tag.Scope != "global" && tag.Scope != "admin") {
tagItem := map[string]interface{}{
"id": tag.ID,
"name": tag.Name,
"scope": tag.Scope,
"count": tagCount,
}
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
}
}
} // auth < 2 return nothing for this route
i["tagmap"] = tagMap
return i
}
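The composite count key above keeps tags that share a name but differ in scope from collapsing into one counter; as a sketch, for a hypothetical tag with ID 42 and name "bandwidth":

	key := fmt.Sprint("bandwidth", 42) // "bandwidth42"
	tagCount := counts[key]            // count for exactly this tag row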
// FIXME: Lots of redundant code. Needs refactoring
func buildFilterPresets(query url.Values) map[string]interface{} {
filterPresets := map[string]interface{}{}
@ -208,6 +277,16 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
}
}
}
if query.Get("numHWThreads") != "" {
parts := strings.Split(query.Get("numHWThreads"), "-")
if len(parts) == 2 {
a, e1 := strconv.Atoi(parts[0])
b, e2 := strconv.Atoi(parts[1])
if e1 == nil && e2 == nil {
filterPresets["numHWThreads"] = map[string]int{"from": a, "to": b}
}
}
}
if query.Get("numAccelerators") != "" {
parts := strings.Split(query.Get("numAccelerators"), "-")
if len(parts) == 2 {
@ -218,6 +297,9 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
}
}
}
if len(query["dbId"]) != 0 {
filterPresets["dbId"] = query["dbId"]
}
if query.Get("jobId") != "" {
if len(query["jobId"]) == 1 {
filterPresets["jobId"] = query.Get("jobId")
@ -234,7 +316,7 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
}
if query.Get("startTime") != "" {
parts := strings.Split(query.Get("startTime"), "-")
if len(parts) == 2 {
if len(parts) == 2 { // Time in seconds, from - to
a, e1 := strconv.ParseInt(parts[0], 10, 64)
b, e2 := strconv.ParseInt(parts[1], 10, 64)
if e1 == nil && e2 == nil {
@ -243,9 +325,41 @@ func buildFilterPresets(query url.Values) map[string]interface{} {
"to": time.Unix(b, 0).Format(time.RFC3339),
}
}
} else { // named range
filterPresets["startTime"] = map[string]string{
"range": query.Get("startTime"),
}
}
}
if query.Get("energy") != "" {
parts := strings.Split(query.Get("energy"), "-")
if len(parts) == 2 {
a, e1 := strconv.Atoi(parts[0])
b, e2 := strconv.Atoi(parts[1])
if e1 == nil && e2 == nil {
filterPresets["energy"] = map[string]int{"from": a, "to": b}
}
}
}
if len(query["stat"]) != 0 {
statList := make([]map[string]interface{}, 0)
for _, statEntry := range query["stat"] {
parts := strings.Split(statEntry, "-")
if len(parts) == 3 { // Metric Footprint Stat Field, from - to
a, e1 := strconv.ParseInt(parts[1], 10, 64)
b, e2 := strconv.ParseInt(parts[2], 10, 64)
if e1 == nil && e2 == nil {
statEntry := map[string]interface{}{
"field": parts[0],
"from": a,
"to": b,
}
statList = append(statList, statEntry)
}
}
}
filterPresets["stats"] = statList
}
return filterPresets
}
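As an illustration of the parsing above, a request like /monitoring/jobs/?numHWThreads=4-8&startTime=1690000000-1690086400 (timestamps are illustrative) would produce presets roughly equivalent to:

	filterPresets := map[string]interface{}{
		"numHWThreads": map[string]int{"from": 4, "to": 8},
		"startTime": map[string]string{
			"from": time.Unix(1690000000, 0).Format(time.RFC3339),
			"to":   time.Unix(1690086400, 0).Format(time.RFC3339),
		},
	}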
@ -264,20 +378,25 @@ func SetupRoutes(router *mux.Router, buildInfo web.Build) {
infos := route.Setup(map[string]interface{}{}, r)
if id, ok := infos["id"]; ok {
title = strings.Replace(route.Title, "<ID>", id.(string), 1)
if sid, ok := infos["sid"]; ok { // 2nd ID element
title = strings.Replace(title, "<SID>", sid.(string), 1)
}
}
// Get User -> What if NIL?
user := repository.GetUserFromContext(r.Context())
// Get Roles
availableRoles, _ := schema.GetValidRolesMap(user)
page := web.Page{
Title: title,
User: *user,
Roles: availableRoles,
Build: buildInfo,
Config: conf,
Infos: infos,
Title: title,
User: *user,
Roles: availableRoles,
Build: buildInfo,
Config: conf,
Resampling: config.Keys.EnableResampling,
Infos: infos,
}
if route.Filter {
@ -302,11 +421,19 @@ func HandleSearchBar(rw http.ResponseWriter, r *http.Request, buildInfo web.Buil
case "jobId":
http.Redirect(rw, r, "/monitoring/jobs/?jobId="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
case "jobName":
http.Redirect(rw, r, "/monitoring/jobs/?jobName="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
// Add Last 30 Days to mitigate timeouts
untilTime := strconv.FormatInt(time.Now().Unix(), 10)
fromTime := strconv.FormatInt((time.Now().Unix() - int64(30*24*3600)), 10)
http.Redirect(rw, r, "/monitoring/jobs/?startTime="+fromTime+"-"+untilTime+"&jobName="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
case "projectId":
http.Redirect(rw, r, "/monitoring/jobs/?projectMatch=eq&project="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
case "arrayJobId":
http.Redirect(rw, r, "/monitoring/jobs/?arrayJobId="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
// Add Last 30 Days to mitigate timeouts
untilTime := strconv.FormatInt(time.Now().Unix(), 10)
fromTime := strconv.FormatInt((time.Now().Unix() - int64(30*24*3600)), 10)
http.Redirect(rw, r, "/monitoring/jobs/?startTime="+fromTime+"-"+untilTime+"&arrayJobId="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery
case "username":
if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleManager}) {
http.Redirect(rw, r, "/monitoring/users/?user="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound)
@ -339,7 +466,11 @@ func HandleSearchBar(rw http.ResponseWriter, r *http.Request, buildInfo web.Buil
} else if project != "" {
http.Redirect(rw, r, "/monitoring/jobs/?projectMatch=eq&project="+url.QueryEscape(project), http.StatusFound) // projectId (equal)
} else if jobname != "" {
http.Redirect(rw, r, "/monitoring/jobs/?jobName="+url.QueryEscape(jobname), http.StatusFound) // JobName (contains)
// Add Last 30 Days to mitigate timeouts
untilTime := strconv.FormatInt(time.Now().Unix(), 10)
fromTime := strconv.FormatInt((time.Now().Unix() - int64(30*24*3600)), 10)
http.Redirect(rw, r, "/monitoring/jobs/?startTime="+fromTime+"-"+untilTime+"&jobName="+url.QueryEscape(jobname), http.StatusFound) // 30D Fitler + JobName (contains)
} else {
web.RenderTemplate(rw, "message.tmpl", &web.Page{Title: "Info", MsgType: "alert-info", Message: "Search without result", User: *user, Roles: availableRoles, Build: buildInfo})
}

View File

@ -0,0 +1,41 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/go-co-op/gocron/v2"
)
func RegisterCompressionService(compressOlderThan int) {
log.Info("Register compression service")
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(05, 0, 0))),
gocron.NewTask(
func() {
var jobs []*schema.Job
var err error
ar := archive.GetHandle()
startTime := time.Now().Unix() - int64(compressOlderThan*24*3600)
lastTime := ar.CompressLast(startTime)
if startTime == lastTime {
log.Info("Compression Service - Complete archive run")
jobs, err = jobRepo.FindJobsBetween(0, startTime)
} else {
jobs, err = jobRepo.FindJobsBetween(lastTime, startTime)
}
if err != nil {
log.Warnf("Error while looking for compression jobs: %v", err)
}
ar.Compress(jobs)
}))
}

View File

@ -0,0 +1,36 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-co-op/gocron/v2"
)
func RegisterLdapSyncService(ds string) {
interval, err := parseDuration(ds)
if err != nil {
log.Warnf("Could not parse duration for sync interval: %v",
ds)
return
}
auth := auth.GetAuthInstance()
log.Info("Register LDAP sync service")
s.NewJob(gocron.DurationJob(interval),
gocron.NewTask(
func() {
t := time.Now()
log.Printf("ldap sync started at %s", t.Format(time.RFC3339))
if err := auth.LdapAuth.Sync(); err != nil {
log.Errorf("ldap sync failed: %s", err.Error())
}
log.Print("ldap sync done")
}))
}

View File

@ -0,0 +1,67 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-co-op/gocron/v2"
)
func RegisterRetentionDeleteService(age int, includeDB bool) {
log.Info("Register retention delete service")
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
gocron.NewTask(
func() {
startTime := time.Now().Unix() - int64(age*24*3600)
jobs, err := jobRepo.FindJobsBetween(0, startTime)
if err != nil {
log.Warnf("Error while looking for retention jobs: %s", err.Error())
}
archive.GetHandle().CleanUp(jobs)
if includeDB {
cnt, err := jobRepo.DeleteJobsBefore(startTime)
if err != nil {
log.Errorf("Error while deleting retention jobs from db: %s", err.Error())
} else {
log.Infof("Retention: Removed %d jobs from db", cnt)
}
if err = jobRepo.Optimize(); err != nil {
log.Errorf("Error occured in db optimization: %s", err.Error())
}
}
}))
}
func RegisterRetentionMoveService(age int, includeDB bool, location string) {
log.Info("Register retention move service")
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(04, 0, 0))),
gocron.NewTask(
func() {
startTime := time.Now().Unix() - int64(age*24*3600)
jobs, err := jobRepo.FindJobsBetween(0, startTime)
if err != nil {
log.Warnf("Error while looking for retention jobs: %s", err.Error())
}
archive.GetHandle().Move(jobs, location)
if includeDB {
cnt, err := jobRepo.DeleteJobsBefore(startTime)
if err != nil {
log.Errorf("Error while deleting retention jobs from db: %v", err)
} else {
log.Infof("Retention: Removed %d jobs from db", cnt)
}
if err = jobRepo.Optimize(); err != nil {
log.Errorf("Error occured in db optimization: %v", err)
}
}
}))
}

View File

@ -0,0 +1,27 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"runtime"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-co-op/gocron/v2"
)
func RegisterStopJobsExceedTime() {
log.Info("Register undead jobs service")
s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(03, 0, 0))),
gocron.NewTask(
func() {
err := jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime)
if err != nil {
log.Warnf("Error while looking for jobs exceeding their walltime: %s", err.Error())
}
runtime.GC()
}))
}

View File

@ -0,0 +1,90 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/go-co-op/gocron/v2"
)
var (
s gocron.Scheduler
jobRepo *repository.JobRepository
)
func parseDuration(s string) (time.Duration, error) {
interval, err := time.ParseDuration(s)
if err != nil {
log.Warnf("Could not parse duration for sync interval: %v",
s)
return 0, err
}
if interval == 0 {
log.Info("TaskManager: Sync interval is zero")
}
return interval, nil
}
func Start() {
var err error
jobRepo = repository.GetJobRepository()
s, err = gocron.NewScheduler()
if err != nil {
log.Abortf("Taskmanager Start: Could not create gocron scheduler.\nError: %s\n", err.Error())
}
if config.Keys.StopJobsExceedingWalltime > 0 {
RegisterStopJobsExceedTime()
}
var cfg struct {
Retention schema.Retention `json:"retention"`
Compression int `json:"compression"`
}
cfg.Retention.IncludeDB = true
if err := json.Unmarshal(config.Keys.Archive, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
}
switch cfg.Retention.Policy {
case "delete":
RegisterRetentionDeleteService(
cfg.Retention.Age,
cfg.Retention.IncludeDB)
case "move":
RegisterRetentionMoveService(
cfg.Retention.Age,
cfg.Retention.IncludeDB,
cfg.Retention.Location)
}
if cfg.Compression > 0 {
RegisterCompressionService(cfg.Compression)
}
lc := config.Keys.LdapConfig
if lc != nil && lc.SyncInterval != "" {
RegisterLdapSyncService(lc.SyncInterval)
}
RegisterFootprintWorker()
RegisterUpdateDurationWorker()
s.Start()
}
func Shutdown() {
s.Shutdown()
}
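For reference, a sketch of the archive config section this unmarshals. The "kind" and "path" keys appear elsewhere in the repository; the retention keys are inferred from the struct fields used above (schema.Retention's exact JSON tags may differ) and the values are illustrative:

	{
	  "kind": "file",
	  "path": "./var/job-archive",
	  "retention": {
	    "policy": "move",
	    "age": 365,
	    "location": "/archive/cold",
	    "includeDB": true
	  },
	  "compression": 7
	}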

View File

@ -0,0 +1,33 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-co-op/gocron/v2"
)
func RegisterUpdateDurationWorker() {
var frequency string
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.DurationWorker != "" {
frequency = config.Keys.CronFrequency.DurationWorker
} else {
frequency = "5m"
}
d, _ := time.ParseDuration(frequency)
log.Infof("Register Duration Update service with %s interval", frequency)
s.NewJob(gocron.DurationJob(d),
gocron.NewTask(
func() {
start := time.Now()
log.Printf("Update duration started at %s", start.Format(time.RFC3339))
jobRepo.UpdateDuration()
log.Printf("Update duration is done and took %s", time.Since(start))
}))
}

View File

@ -0,0 +1,146 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"context"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
"github.com/go-co-op/gocron/v2"
)
func RegisterFootprintWorker() {
var frequency string
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.FootprintWorker != "" {
frequency = config.Keys.CronFrequency.FootprintWorker
} else {
frequency = "10m"
}
d, _ := time.ParseDuration(frequency)
log.Infof("Register Footprint Update service with %s interval", frequency)
s.NewJob(gocron.DurationJob(d),
gocron.NewTask(
func() {
s := time.Now()
c := 0
ce := 0
cl := 0
log.Printf("Update Footprints started at %s", s.Format(time.RFC3339))
for _, cluster := range archive.Clusters {
s_cluster := time.Now()
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
if err != nil {
continue
}
// NOTE: Additional Subcluster Loop Could Allow For Limited List Of Footprint-Metrics Only.
// - Chunk-Size Would Then Be 'SubCluster' (Running Jobs, Transactions) as Lists Can Change Within SCs
// - Would Require Review of 'updateFootprint' Usage (Logic Could Possibly Be Included Here Completely)
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
repo, err := metricdata.GetMetricDataRepo(cluster.Name)
if err != nil {
log.Errorf("no metric data repository configured for '%s'", cluster.Name)
continue
}
pendingStatements := []sq.UpdateBuilder{}
for _, job := range jobs {
log.Debugf("Prepare job %d", job.JobID)
cl++
s_job := time.Now()
jobStats, err := repo.LoadStats(job, allMetrics, context.Background())
if err != nil {
log.Errorf("error wile loading job data stats for footprint update: %v", err)
ce++
continue
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for _, metric := range allMetrics {
avg, min, max := 0.0, 0.0, 0.0
data, ok := jobStats[metric] // JobStats[Metric1:[Hostname1:[Stats], Hostname2:[Stats], ...], Metric2[...] ...]
if ok {
for _, res := range job.Resources {
hostStats, ok := data[res.Hostname]
if ok {
avg += hostStats.Avg
min = math.Min(min, hostStats.Min)
max = math.Max(max, hostStats.Max)
}
}
}
// Add values rounded to 2 digits: repo.LoadStats may return unrounded
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
// Build Statement per Job, Add to Pending Array
stmt := sq.Update("job")
stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta)
if err != nil {
log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error())
ce++
continue
}
stmt = stmt.Where("job.id = ?", job.ID)
pendingStatements = append(pendingStatements, stmt)
log.Debugf("Job %d took %s", job.JobID, time.Since(s_job))
}
t, err := jobRepo.TransactionInit()
if err != nil {
log.Errorf("failed TransactionInit %v", err)
log.Errorf("skipped %d transactions for cluster %s", len(pendingStatements), cluster.Name)
ce += len(pendingStatements)
} else {
for _, ps := range pendingStatements {
query, args, err := ps.ToSql()
if err != nil {
log.Errorf("failed in ToSQL conversion: %v", err)
ce++
} else {
// args...: Footprint-JSON, Energyfootprint-JSON, TotalEnergy, JobID
jobRepo.TransactionAdd(t, query, args...)
c++
}
}
jobRepo.TransactionEnd(t)
}
log.Debugf("Finish Cluster %s, took %s", cluster.Name, time.Since(s_cluster))
}
log.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s", c, cl, ce, time.Since(s))
}))
}

View File

@ -4,7 +4,13 @@
// license that can be found in the LICENSE file.
package util
import "golang.org/x/exp/constraints"
import (
"golang.org/x/exp/constraints"
"fmt"
"math"
"sort"
)
func Min[T constraints.Ordered](a, b T) T {
if a < b {
@ -19,3 +25,36 @@ func Max[T constraints.Ordered](a, b T) T {
}
return b
}
func sortedCopy(input []float64) []float64 {
sorted := make([]float64, len(input))
copy(sorted, input)
sort.Float64s(sorted)
return sorted
}
func Mean(input []float64) (float64, error) {
if len(input) == 0 {
return math.NaN(), fmt.Errorf("input array is empty: %#v", input)
}
sum := 0.0
for _, n := range input {
sum += n
}
return sum / float64(len(input)), nil
}
func Median(input []float64) (median float64, err error) {
c := sortedCopy(input)
// Even numbers: add the two middle numbers, divide by two (use mean function)
// Odd numbers: Use the middle number
l := len(c)
if l == 0 {
return math.NaN(), fmt.Errorf("input array is empty: %#v", input)
} else if l%2 == 0 {
median, _ = Mean(c[l/2-1 : l/2+1])
} else {
median = c[l/2]
}
return median, nil
}

View File

@ -7,13 +7,14 @@ package archive
import (
"encoding/json"
"fmt"
"sync"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
const Version uint64 = 1
const Version uint64 = 2
type ArchiveBackend interface {
Init(rawConfig json.RawMessage) (uint64, error)
@ -26,6 +27,8 @@ type ArchiveBackend interface {
LoadJobData(job *schema.Job) (schema.JobData, error)
LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error)
LoadClusterCfg(name string) (*schema.Cluster, error)
StoreJobMeta(jobMeta *schema.JobMeta) error
@ -53,47 +56,55 @@ type JobContainer struct {
}
var (
initOnce sync.Once
cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
ar ArchiveBackend
useArchive bool
)
func Init(rawConfig json.RawMessage, disableArchive bool) error {
useArchive = !disableArchive
var err error
var cfg struct {
Kind string `json:"kind"`
}
initOnce.Do(func() {
useArchive = !disableArchive
if err := json.Unmarshal(rawConfig, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
return err
}
var cfg struct {
Kind string `json:"kind"`
}
switch cfg.Kind {
case "file":
ar = &FsArchive{}
// case "s3":
// ar = &S3Archive{}
default:
return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
}
if err = json.Unmarshal(rawConfig, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
return
}
version, err := ar.Init(rawConfig)
if err != nil {
log.Error("Error while initializing archiveBackend")
return err
}
log.Infof("Load archive version %d", version)
switch cfg.Kind {
case "file":
ar = &FsArchive{}
// case "s3":
// ar = &S3Archive{}
default:
err = fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
}
return initClusterConfig()
var version uint64
version, err = ar.Init(rawConfig)
if err != nil {
log.Errorf("Error while initializing archiveBackend: %s", err.Error())
return
}
log.Infof("Load archive version %d", version)
err = initClusterConfig()
})
return err
}
func GetHandle() ArchiveBackend {
return ar
}
// Helper to metricdata.LoadAverages().
// Helper to metricdataloader.LoadAverages().
func LoadAveragesFromArchive(
job *schema.Job,
metrics []string,
@ -101,7 +112,7 @@ func LoadAveragesFromArchive(
) error {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
@ -116,10 +127,55 @@ func LoadAveragesFromArchive(
return nil
}
// Helper to metricdataloader.LoadJobStats().
func LoadStatsFromArchive(
job *schema.Job,
metrics []string,
) (map[string]schema.MetricStatistics, error) {
data := make(map[string]schema.MetricStatistics, len(metrics))
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return data, err
}
for _, m := range metrics {
stat, ok := metaFile.Statistics[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
data[m] = schema.MetricStatistics{
Avg: stat.Avg,
Min: stat.Min,
Max: stat.Max,
}
}
return data, nil
}
// Helper to metricdataloader.LoadScopedJobStats().
func LoadScopedStatsFromArchive(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
) (schema.ScopedJobStats, error) {
data, err := ar.LoadJobStats(job)
if err != nil {
log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error())
return nil, err
}
return data, nil
}
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return nil, err
}
@ -135,7 +191,7 @@ func UpdateMetadata(job *schema.Job, metadata map[string]string) error {
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
@ -155,15 +211,16 @@ func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
jobMeta.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
jobMeta.Tags = append(jobMeta.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
Name: tag.Name,
Type: tag.Type,
Scope: tag.Scope,
})
}

View File

@ -12,13 +12,16 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var Clusters []*schema.Cluster
var nodeLists map[string]map[string]NodeList
var (
Clusters []*schema.Cluster
GlobalMetricList []*schema.GlobalMetricListItem
NodeLists map[string]map[string]NodeList
)
func initClusterConfig() error {
Clusters = []*schema.Cluster{}
nodeLists = map[string]map[string]NodeList{}
NodeLists = map[string]map[string]NodeList{}
metricLookup := make(map[string]schema.GlobalMetricListItem)
for _, c := range ar.GetClusters() {
@ -49,11 +52,79 @@ func initClusterConfig() error {
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
}
ml, ok := metricLookup[mc.Name]
if !ok {
metricLookup[mc.Name] = schema.GlobalMetricListItem{
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
}
ml = metricLookup[mc.Name]
}
availability := schema.ClusterSupport{Cluster: cluster.Name}
scLookup := make(map[string]*schema.SubClusterConfig)
for _, scc := range mc.SubClusters {
scLookup[scc.Name] = scc
}
for _, sc := range cluster.SubClusters {
newMetric := &schema.MetricConfig{
Unit: mc.Unit,
Energy: mc.Energy,
Name: mc.Name,
Scope: mc.Scope,
Aggregation: mc.Aggregation,
Peak: mc.Peak,
Caution: mc.Caution,
Alert: mc.Alert,
Timestep: mc.Timestep,
Normal: mc.Normal,
LowerIsBetter: mc.LowerIsBetter,
}
if mc.Footprint != "" {
newMetric.Footprint = mc.Footprint
}
if cfg, ok := scLookup[sc.Name]; ok {
if !cfg.Remove {
availability.SubClusters = append(availability.SubClusters, sc.Name)
newMetric.Peak = cfg.Peak
newMetric.Normal = cfg.Normal
newMetric.Caution = cfg.Caution
newMetric.Alert = cfg.Alert
newMetric.Footprint = cfg.Footprint
newMetric.Energy = cfg.Energy
newMetric.LowerIsBetter = cfg.LowerIsBetter
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
ml.Footprint = newMetric.Footprint
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
} else {
availability.SubClusters = append(availability.SubClusters, sc.Name)
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
}
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
metricLookup[mc.Name] = ml
}
Clusters = append(Clusters, cluster)
nodeLists[cluster.Name] = make(map[string]NodeList)
NodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "*" {
continue
@ -63,15 +134,18 @@ func initClusterConfig() error {
if err != nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
NodeLists[cluster.Name][sc.Name] = nl
}
}
for _, ml := range metricLookup {
GlobalMetricList = append(GlobalMetricList, &ml)
}
return nil
}
func GetCluster(cluster string) *schema.Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
@ -90,11 +164,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
}
}
}
return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
}
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
@ -110,7 +183,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *schema.BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster)
@ -130,7 +202,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
for sc, nl := range NodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
@ -146,8 +218,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
for sc, nl := range NodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
@ -164,3 +235,13 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) {
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname)
}
func MetricIndex(mc []schema.MetricConfig, name string) (int, error) {
for i, m := range mc {
if m.Name == name {
return i, nil
}
}
return 0, fmt.Errorf("unknown metric name %s", name)
}

View File

@ -0,0 +1,39 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive_test
import (
"encoding/json"
"testing"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
)
func TestClusterConfig(t *testing.T) {
if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}"), false); err != nil {
t.Fatal(err)
}
sc, err := archive.GetSubCluster("fritz", "spr1tb")
if err != nil {
t.Fatal(err)
}
// spew.Dump(sc.MetricConfig)
if len(sc.Footprint) != 3 {
t.Fail()
}
if len(sc.MetricConfig) != 15 {
t.Fail()
}
for _, metric := range sc.MetricConfig {
if metric.LowerIsBetter && metric.Name != "mem_used" {
t.Fail()
}
}
// spew.Dump(archive.GlobalMetricList)
// t.Fail()
}

View File

@ -115,6 +115,40 @@ func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
}
}
func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobStats()- %v", err)
return nil, err
}
defer f.Close()
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(r, filename)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(bufio.NewReader(f), filename)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) {
var config FsArchiveConfig
@ -389,6 +423,18 @@ func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {
return loadJobData(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error) {
var isCompressed bool = true
filename := getPath(job, fsa.path, "data.json.gz")
if !util.CheckFileExists(filename) {
filename = getPath(job, fsa.path, "data.json")
isCompressed = false
}
return loadJobStats(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
filename := getPath(job, fsa.path, "meta.json")
return loadJobMeta(filename)

View File

@ -30,6 +30,7 @@ func TestInitNoJson(t *testing.T) {
t.Fatal(err)
}
}
func TestInitNotExists(t *testing.T) {
var fsa FsArchive
_, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/job-archive\"}"))
@ -47,10 +48,10 @@ func TestInit(t *testing.T) {
if fsa.path != "testdata/archive" {
t.Fail()
}
if version != 1 {
if version != 2 {
t.Fail()
}
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
if len(fsa.clusters) != 3 || fsa.clusters[1] != "emmy" {
t.Fail()
}
}
@ -133,7 +134,6 @@ func TestLoadJobData(t *testing.T) {
}
func BenchmarkLoadJobData(b *testing.B) {
tmpdir := b.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
util.CopyDir("./testdata/archive/", jobarchive)
@ -157,7 +157,6 @@ func BenchmarkLoadJobData(b *testing.B) {
}
func BenchmarkLoadJobDataCompressed(b *testing.B) {
tmpdir := b.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
util.CopyDir("./testdata/archive/", jobarchive)

View File

@ -9,8 +9,8 @@ import (
"io"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
@ -32,6 +32,43 @@ func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
return data.(schema.JobData), nil
}
func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) {
jobData, err := DecodeJobData(r, k)
// Convert schema.JobData to schema.ScopedJobStats
if jobData != nil {
scopedJobStats := make(schema.ScopedJobStats)
for metric, metricData := range jobData {
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
for scope, jobMetric := range metricData {
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range jobMetric.Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Id: series.Id,
Data: &series.Statistics,
})
}
// Remove empty entries from the map so that one can later check len(scopedJobStats[metric][scope])
if len(scopedJobStats[metric][scope]) == 0 {
delete(scopedJobStats[metric], scope)
if len(scopedJobStats[metric]) == 0 {
delete(scopedJobStats, metric)
}
}
}
}
return scopedJobStats, nil
}
return nil, err
}
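A sketch of the shape change performed here, assuming a single metric with one node-scope series on host n1:

	// JobData:        {"flops_any": {"node": {Series: [{Hostname: "n1", Statistics: {...}}]}}}
	// ScopedJobStats: {"flops_any": {"node": [&ScopedStats{Hostname: "n1", Data: &Statistics}]}}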
func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) {
var d schema.JobMeta
if err := json.NewDecoder(r).Decode(&d); err != nil {

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1 +1 @@
1
2

View File

@ -46,12 +46,12 @@ var loglevel string = "info"
/* CONFIG */
func Init(lvl string, logdate bool) {
// Discard I/O for all writers below selected loglevel; <CRITICAL> is always written.
switch lvl {
case "crit":
ErrWriter = io.Discard
fallthrough
case "err", "fatal":
case "err":
WarnWriter = io.Discard
fallthrough
case "warn":
@ -63,8 +63,7 @@ func Init(lvl string, logdate bool) {
// Nothing to do...
break
default:
fmt.Printf("pkg/log: Flag 'loglevel' has invalid value %#v\npkg/log: Will use default loglevel 'debug'\n", lvl)
//SetLogLevel("debug")
fmt.Printf("pkg/log: Flag 'loglevel' has invalid value %#v\npkg/log: Will use default loglevel '%s'\n", lvl, loglevel)
}
if !logdate {
@ -84,109 +83,138 @@ func Init(lvl string, logdate bool) {
loglevel = lvl
}
/* PRINT */
// Private helper
func printStr(v ...interface{}) string {
return fmt.Sprint(v...)
}
// Uses Info() -> If errorpath required at some point:
// Will need own writer with 'Output(2, out)' to correctly render path
func Print(v ...interface{}) {
Info(v...)
}
func Debug(v ...interface{}) {
DebugLog.Output(2, printStr(v...))
}
func Info(v ...interface{}) {
InfoLog.Output(2, printStr(v...))
}
func Warn(v ...interface{}) {
WarnLog.Output(2, printStr(v...))
}
func Error(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
}
// Writes panic stacktrace, but keeps application alive
func Panic(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
panic("Panic triggered ...")
}
func Crit(v ...interface{}) {
CritLog.Output(2, printStr(v...))
}
// Writes critical log, stops application
func Fatal(v ...interface{}) {
CritLog.Output(2, printStr(v...))
os.Exit(1)
}
/* PRINT FORMAT*/
// Private helper
func printfStr(format string, v ...interface{}) string {
return fmt.Sprintf(format, v...)
}
// Uses Infof() -> If errorpath required at some point:
// Will need own writer with 'Output(2, out)' to correctly render path
func Printf(format string, v ...interface{}) {
Infof(format, v...)
}
func Debugf(format string, v ...interface{}) {
DebugLog.Output(2, printfStr(format, v...))
}
func Infof(format string, v ...interface{}) {
InfoLog.Output(2, printfStr(format, v...))
}
func Warnf(format string, v ...interface{}) {
WarnLog.Output(2, printfStr(format, v...))
}
func Errorf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
}
// Writes panic stacktrace, but keeps application alive
func Panicf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
panic("Panic triggered ...")
}
func Critf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
}
// Writes crit log, stops application
func Fatalf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
os.Exit(1)
}
/* HELPER */
func Loglevel() string {
return loglevel
}
/* SPECIAL */
/* PRIVATE HELPER */
// func Finfof(w io.Writer, format string, v ...interface{}) {
// if w != io.Discard {
// if logDateTime {
// currentTime := time.Now()
// fmt.Fprintf(InfoWriter, currentTime.String()+InfoPrefix+format+"\n", v...)
// } else {
// fmt.Fprintf(InfoWriter, InfoPrefix+format+"\n", v...)
// }
// }
// }
// Return unformatted string
func printStr(v ...interface{}) string {
return fmt.Sprint(v...)
}
// Return formatted string
func printfStr(format string, v ...interface{}) string {
return fmt.Sprintf(format, v...)
}
/* PRINT */
// Prints to STDOUT without string formatting; application continues.
// Used for special cases not requiring log information like date or location.
func Print(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
}
// Prints to STDOUT without string formatting; application exits with error code 0.
// Used for exiting successfully with a message after an expected outcome, e.g. successful single-call application runs.
func Exit(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
os.Exit(0)
}
// Prints to STDOUT without string formatting; application exits with error code 1.
// Used for terminating with a message after expected errors, e.g. wrong arguments or during init().
func Abort(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
os.Exit(1)
}
// Prints to DEBUG writer without string formatting; application continues.
// Used for logging additional information, primarily for development.
func Debug(v ...interface{}) {
DebugLog.Output(2, printStr(v...))
}
// Prints to INFO writer without string formatting; application continues.
// Used for logging additional information, e.g. notable returns or common fail-cases.
func Info(v ...interface{}) {
InfoLog.Output(2, printStr(v...))
}
// Prints to WARNING writer without string formatting; application continues.
// Used for logging important information, e.g. uncommon edge-cases or administration related information.
func Warn(v ...interface{}) {
WarnLog.Output(2, printStr(v...))
}
// Prints to ERROR writer without string formatting; application continues.
// Used for logging errors, but code still can return default(s) or nil.
func Error(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
}
// Prints to CRITICAL writer without string formatting; application exits with error code 1.
// Used for terminating on unexpected errors with date and code location.
func Fatal(v ...interface{}) {
CritLog.Output(2, printStr(v...))
os.Exit(1)
}
// Prints to PANIC function without string formatting; application exits with panic.
// Used for terminating on unexpected errors with stacktrace.
func Panic(v ...interface{}) {
panic(printStr(v...))
}
/* PRINT FORMAT*/
// Prints to STDOUT with string formatting; application continues.
// Used for special cases not requiring log information like date or location.
func Printf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
}
// Prints to STDOUT with string formatting; application exits with error code 0.
// Used for exiting successfully with a message after an expected outcome, e.g. successful single-call application runs.
func Exitf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
os.Exit(0)
}
// Prints to STDOUT with string formatting; application exits with error code 1.
// Used for terminating with a message after expected errors, e.g. wrong arguments or during init().
func Abortf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
os.Exit(1)
}
// Prints to DEBUG writer with string formatting; application continues.
// Used for logging additional information, primarily for development.
func Debugf(format string, v ...interface{}) {
DebugLog.Output(2, printfStr(format, v...))
}
// Prints to INFO writer with string formatting; application continues.
// Used for logging additional information, e.g. notable returns or common fail-cases.
func Infof(format string, v ...interface{}) {
InfoLog.Output(2, printfStr(format, v...))
}
// Prints to WARNING writer with string formatting; application continues.
// Used for logging important information, e.g. uncommon edge-cases or administration related information.
func Warnf(format string, v ...interface{}) {
WarnLog.Output(2, printfStr(format, v...))
}
// Prints to ERROR writer with string formatting; application continues.
// Used for logging errors, but code still can return default(s) or nil.
func Errorf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
}
// Prints to CRITICAL writer with string formatting; application exits with error code 1.
// Used for terminating on unexpected errors with date and code location.
func Fatalf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
os.Exit(1)
}
// Prints to PANIC function with string formatting; application exits with panic.
// Used for terminating on unexpected errors with stacktrace.
func Panicf(format string, v ...interface{}) {
panic(printfStr(format, v...))
}
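A usage sketch of the reworked level functions (behavior as documented above; the messages are illustrative):

	log.Init("warn", true)                          // discard debug/info output, keep timestamps
	log.Warnf("job %d exceeded its walltime", 4711) // WARNING writer, application continues
	log.Abort("invalid arguments")                  // plain STDOUT, exits with error code 1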

pkg/resampler/resampler.go Normal file
View File

@ -0,0 +1,123 @@
package resampler
import (
"errors"
"fmt"
"math"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func SimpleResampler(data []schema.Float, old_frequency int64, new_frequency int64) ([]schema.Float, int64, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, errors.New("new sampling frequency should be multiple of the old frequency")
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]schema.Float, new_data_length)
for i := 0; i < new_data_length; i++ {
new_data[i] = data[i*step]
}
return new_data, new_frequency, nil
}
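In effect the simple variant keeps every step-th sample; a usage sketch, given some []schema.Float series of 600 points and frequencies in seconds:

	down, freq, err := SimpleResampler(series, 30, 120) // step = 4: keeps series[0], series[4], ...
	// err is nil when the new frequency is a multiple of the old one;
	// here len(down) == 150 and freq == 120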
// Inspired by one of the algorithms from https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf
// Adapted from https://github.com/haoel/downsampling/blob/master/core/lttb.go
func LargestTriangleThreeBucket(data []schema.Float, old_frequency int, new_frequency int) ([]schema.Float, int, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, fmt.Errorf("new sampling frequency: %d should be a multiple of the old frequency: %d", new_frequency, old_frequency)
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]schema.Float, 0, new_data_length)
// Bucket size. Leave room for start and end data points
bucketSize := float64(len(data)-2) / float64(new_data_length-2)
new_data = append(new_data, data[0]) // Always add the first point
// We keep three pointers:
// > bucketLow - the current bucket's beginning location
// > bucketMiddle - the current bucket's ending location,
// also the beginning location of the next bucket
// > bucketHigh - the next bucket's ending location.
bucketLow := 1
bucketMiddle := int(math.Floor(bucketSize)) + 1
var prevMaxAreaPoint int
for i := 0; i < new_data_length-2; i++ {
bucketHigh := int(math.Floor(float64(i+2)*bucketSize)) + 1
if bucketHigh >= len(data)-1 {
bucketHigh = len(data) - 2
}
// Calculate point average for next bucket (containing c)
avgPointX, avgPointY := calculateAverageDataPoint(data[bucketMiddle:bucketHigh+1], int64(bucketMiddle))
// Get the range for current bucket
currBucketStart := bucketLow
currBucketEnd := bucketMiddle
// Point a
pointX := prevMaxAreaPoint
pointY := data[prevMaxAreaPoint]
maxArea := -1.0
var maxAreaPoint int
flag_ := 0
for ; currBucketStart < currBucketEnd; currBucketStart++ {
area := calculateTriangleArea(schema.Float(pointX), pointY, avgPointX, avgPointY, schema.Float(currBucketStart), data[currBucketStart])
if area > maxArea {
maxArea = area
maxAreaPoint = currBucketStart
}
if math.IsNaN(float64(avgPointY)) {
flag_ = 1
}
}
if flag_ == 1 {
new_data = append(new_data, schema.NaN) // Propagate NaN: the averaged next bucket contained NaN values
} else {
new_data = append(new_data, data[maxAreaPoint]) // Pick the max-area point from the bucket
}
prevMaxAreaPoint = maxAreaPoint // This max-area point becomes the next iteration's prevMaxAreaPoint
// Move to the next window
bucketLow = bucketMiddle
bucketMiddle = bucketHigh
}
new_data = append(new_data, data[len(data)-1]) // Always add last
return new_data, new_frequency, nil
}
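A usage sketch, downsampling a synthetic 600-point series from 30s to 120s resolution (the data values are illustrative):

	series := make([]schema.Float, 600)
	for i := range series {
		series[i] = schema.Float(i % 50) // synthetic sawtooth
	}
	down, freq, err := LargestTriangleThreeBucket(series, 30, 120)
	if err != nil {
		// handle error
	}
	// len(down) == 150, freq == 120; the first and last points are always preserved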

pkg/resampler/util.go Normal file
View File

@ -0,0 +1,35 @@
package resampler
import (
"math"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func calculateTriangleArea(paX, paY, pbX, pbY, pcX, pcY schema.Float) float64 {
area := ((paX-pcX)*(pbY-paY) - (paX-pbX)*(pcY-paY)) * 0.5
return math.Abs(float64(area))
}
func calculateAverageDataPoint(points []schema.Float, xStart int64) (avgX schema.Float, avgY schema.Float) {
flag := 0
for _, point := range points {
avgX += schema.Float(xStart)
avgY += point
xStart++
if math.IsNaN(float64(point)) {
flag = 1
}
}
l := schema.Float(len(points))
avgX /= l
avgY /= l
if flag == 1 {
return avgX, schema.NaN
} else {
return avgX, avgY
}
}

View File

@ -5,85 +5,16 @@
package runtimeEnv
import (
"bufio"
"errors"
"fmt"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"syscall"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
// Very simple and limited .env file reader.
// All variable definitions found are directly
// added to the processes environment.
func LoadEnv(file string) error {
f, err := os.Open(file)
if err != nil {
log.Error("Error while opening .env file")
return err
}
defer f.Close()
s := bufio.NewScanner(bufio.NewReader(f))
for s.Scan() {
line := s.Text()
if strings.HasPrefix(line, "#") || len(line) == 0 {
continue
}
if strings.Contains(line, "#") {
return errors.New("'#' are only supported at the start of a line")
}
line = strings.TrimPrefix(line, "export ")
parts := strings.SplitN(line, "=", 2)
if len(parts) != 2 {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
key := strings.TrimSpace(parts[0])
val := strings.TrimSpace(parts[1])
if strings.HasPrefix(val, "\"") {
if !strings.HasSuffix(val, "\"") {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
runes := []rune(val[1 : len(val)-1])
sb := strings.Builder{}
for i := 0; i < len(runes); i++ {
if runes[i] == '\\' {
i++
switch runes[i] {
case 'n':
sb.WriteRune('\n')
case 'r':
sb.WriteRune('\r')
case 't':
sb.WriteRune('\t')
case '"':
sb.WriteRune('"')
default:
return fmt.Errorf("RUNTIME/SETUP > unsupported escape sequence in quoted string: backslash %#v", runes[i])
}
continue
}
sb.WriteRune(runes[i])
}
val = sb.String()
}
os.Setenv(key, val)
}
return s.Err()
}
// Changes the process's user and group to those
// specified in the config.json. The go runtime
// takes care of all threads (and not only the calling one)

View File

@ -30,38 +30,47 @@ type MetricValue struct {
}
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
Topology Topology `json:"topology"`
Name string `json:"name"`
Nodes string `json:"nodes"`
ProcessorType string `json:"processorType"`
Topology Topology `json:"topology"`
FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
MetricConfig []MetricConfig `json:"metricConfig,omitempty"`
Footprint []string `json:"footprint,omitempty"`
EnergyFootprint []string `json:"energyFootprint,omitempty"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
}
type SubClusterConfig struct {
Name string `json:"name"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
Name string `json:"name"`
Footprint string `json:"footprint,omitempty"`
Energy string `json:"energy"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
LowerIsBetter bool `json:"lowerIsBetter"`
}
type MetricConfig struct {
Name string `json:"name"`
Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
Unit Unit `json:"unit"`
Energy string `json:"energy"`
Name string `json:"name"`
Scope MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Footprint string `json:"footprint,omitempty"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
Peak float64 `json:"peak"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Timestep int `json:"timestep"`
Normal float64 `json:"normal"`
LowerIsBetter bool `json:"lowerIsBetter"`
}
type Cluster struct {
@ -70,14 +79,27 @@ type Cluster struct {
SubClusters []*SubCluster `json:"subClusters"`
}
type ClusterSupport struct {
Cluster string `json:"cluster"`
SubClusters []string `json:"subclusters"`
}
type GlobalMetricListItem struct {
Name string `json:"name"`
Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Footprint string `json:"footprint,omitempty"`
Availability []ClusterSupport `json:"availability"`
}
// Return a list of socket IDs given a list of hwthread IDs. Even if just one
// hwthread is in that socket, add it to the list. If no hwthreads other than
// those in the argument list are assigned to one of the sockets in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromHWThreads(
hwthreads []int) (sockets []int, exclusive bool) {
hwthreads []int,
) (sockets []int, exclusive bool) {
socketsMap := map[int]int{}
for _, hwthread := range hwthreads {
for socket, hwthreadsInSocket := range topo.Socket {
@ -100,14 +122,46 @@ func (topo *Topology) GetSocketsFromHWThreads(
return sockets, exclusive
}
// Return a list of socket IDs given a list of core IDs. Even if just one
// core is in that socket, add it to the list. If no cores other than
// those in the argument list are assigned to one of the sockets in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromCores(
cores []int,
) (sockets []int, exclusive bool) {
socketsMap := map[int]int{}
for _, core := range cores {
for _, hwthreadInCore := range topo.Core[core] {
for socket, hwthreadsInSocket := range topo.Socket {
for _, hwthreadInSocket := range hwthreadsInSocket {
if hwthreadInCore == hwthreadInSocket {
socketsMap[socket] += 1
}
}
}
}
}
exclusive = true
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
sockets = make([]int, 0, len(socketsMap))
for socket, count := range socketsMap {
sockets = append(sockets, socket)
exclusive = exclusive && count == hwthreadsPerSocket
}
return sockets, exclusive
}
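A usage sketch for the new helper (the topology literal is hypothetical; the field names follow the Topology type as used above):
package main
import (
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func main() {
// Hypothetical 8-hwthread node: two sockets, two hwthreads per core.
topo := schema.Topology{
Node:   []int{0, 1, 2, 3, 4, 5, 6, 7},
Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}},
Core:   [][]int{{0, 1}, {2, 3}, {4, 5}, {6, 7}},
}
// Cores 0 and 1 cover hwthreads 0-3, i.e. all of socket 0,
// so the socket is used exclusively.
sockets, exclusive := topo.GetSocketsFromCores([]int{0, 1})
fmt.Println(sockets, exclusive) // [0] true
}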
// Return a list of core IDs given a list of hwthread IDs. Even if just one
// hwthread is in that core, add it to the list. If no hwthreads other than
// those in the argument list are assigned to one of the cores in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetCoresFromHWThreads(
hwthreads []int) (cores []int, exclusive bool) {
hwthreads []int,
) (cores []int, exclusive bool) {
coresMap := map[int]int{}
for _, hwthread := range hwthreads {
for core, hwthreadsInCore := range topo.Core {
@ -136,8 +190,8 @@ func (topo *Topology) GetCoresFromHWThreads(
// memory domains in the first return value, return true as the second value.
// TODO: Optimize this, there must be a more efficient way/algorithm.
func (topo *Topology) GetMemoryDomainsFromHWThreads(
hwthreads []int) (memDoms []int, exclusive bool) {
hwthreads []int,
) (memDoms []int, exclusive bool) {
memDomsMap := map[int]int{}
for _, hwthread := range hwthreads {
for memDom, hwthreadsInmemDom := range topo.MemoryDomain {
@ -172,7 +226,17 @@ func (topo *Topology) GetAcceleratorID(id int) (string, error) {
}
}
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
// Return list of hardware (string) accelerator IDs
func (topo *Topology) GetAcceleratorIDs() []string {
accels := make([]string, 0)
for _, accel := range topo.Accelerators {
accels = append(accels, accel.ID)
}
return accels
}
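Continuing the sketch above: the new accessor simply collects the raw string device IDs (the Accelerator literal is hypothetical; only the string ID field is relied on here):
// Hypothetical accelerator entries; IDs are opaque strings such as PCI addresses.
topo.Accelerators = []*schema.Accelerator{
{ID: "00000000:0B:00.0", Type: "Nvidia GPU", Model: "A100"},
}
fmt.Println(topo.GetAcceleratorIDs()) // [00000000:0B:00.0]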
// Outdated? Or: Return indices of accelerators in parent array?
func (topo *Topology) GetAcceleratorIDsAsInt() ([]int, error) {
accels := make([]int, 0)
for _, accel := range topo.Accelerators {
id, err := strconv.Atoi(accel.ID)

View File

@ -24,8 +24,9 @@ type LdapConfig struct {
}
type OpenIDConfig struct {
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"syncUserOnLogin"`
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"syncUserOnLogin"`
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
type JWTAuthConfig struct {
@ -45,6 +46,9 @@ type JWTAuthConfig struct {
// Should a non-existent user be added to the DB based on the information in the token
SyncUserOnLogin bool `json:"syncUserOnLogin"`
// Should an existing user be updated in the DB based on the information in the token
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
type IntRange struct {
@ -53,8 +57,9 @@ type IntRange struct {
}
type TimeRange struct {
From *time.Time `json:"from"`
To *time.Time `json:"to"`
From *time.Time `json:"from"`
To *time.Time `json:"to"`
Range string `json:"range,omitempty"`
}
type FilterRanges struct {
@ -76,12 +81,26 @@ type Retention struct {
IncludeDB bool `json:"includeDB"`
}
type ResampleConfig struct {
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
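A sketch of a matching value, mirroring the example in the comment above (a fragment; numbers illustrative):
// Zoom from 10-minute through 5-minute to native 1-minute data once fewer
// than 30 datapoints remain visible.
resampling := schema.ResampleConfig{
Resolutions: []int{600, 300, 60},
Trigger:     30,
}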
type CronFrequency struct {
// Duration Update Worker [Defaults to '5m']
DurationWorker string `json:"duration-worker"`
// Metric-Footprint Update Worker [Defaults to '10m']
FootprintWorker string `json:"footprint-worker"`
}
// Format of the configuration (file). See below for the defaults.
type ProgramConfig struct {
// Address where the http (or https) server will listen on (for example: 'localhost:80').
Addr string `json:"addr"`
// Addresses from which secured API endpoints can be reached
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
ApiAllowedIPs []string `json:"apiAllowedIPs"`
// Drop root permissions once .env was read and the port was taken.
@ -133,6 +152,9 @@ type ProgramConfig struct {
// be provided! Most options here can be overwritten by the user.
UiDefaults map[string]interface{} `json:"ui-defaults"`
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"enable-resampling"`
// Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"`
@ -142,6 +164,13 @@ type ProgramConfig struct {
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
// Energy Mix CO2 Emission Constant [g/kWh]
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
EmissionConstant int `json:"emission-constant"`
// Frequency of cron job workers
CronFrequency *CronFrequency `json:"cron-frequency"`
// Array of Clusters
Clusters []*ClusterConfig `json:"clusters"`
}

View File

@ -16,30 +16,33 @@ import (
// Common subset of Job and JobMeta. Use one of those, not this type directly.
type BaseJob struct {
// The unique identifier of a job
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
// NumCores int32 `json:"numCores" db:"num_cores" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*Tag `json:"tags,omitempty"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
MetaData map[string]string `json:"metaData"` // Additional information about the job
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Cluster string `json:"cluster" db:"cluster" example:"fritz"`
SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"`
Project string `json:"project" db:"project" example:"abcd200"`
User string `json:"user" db:"hpc_user" example:"abcd100h"`
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
Tags []*Tag `json:"tags,omitempty"`
RawEnergyFootprint []byte `json:"-" db:"energy_footprint"`
RawFootprint []byte `json:"-" db:"footprint"`
RawMetaData []byte `json:"-" db:"meta_data"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
EnergyFootprint map[string]float64 `json:"energyFootprint"`
Footprint map[string]float64 `json:"footprint"`
MetaData map[string]string `json:"metaData"`
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Energy float64 `json:"energy" db:"energy"`
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"`
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"`
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
}
// Job struct type
@ -49,19 +52,10 @@ type BaseJob struct {
// Job model
// @Description Information of a HPC job.
type Job struct {
// The unique identifier of a job in the database
ID int64 `json:"id" db:"id"`
StartTime time.Time `json:"startTime"`
BaseJob
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64
FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64
MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64
LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
ID int64 `json:"id" db:"id"`
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"`
}
// JobMeta struct type
@ -88,11 +82,10 @@ type JobLinkResultList struct {
// JobMeta model
// @Description Meta data information of a HPC job.
type JobMeta struct {
// The unique identifier of a job in the database
ID *int64 `json:"id,omitempty"`
ID *int64 `json:"id,omitempty"`
Statistics map[string]JobStatistics `json:"statistics"`
BaseJob
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
Statistics map[string]JobStatistics `json:"statistics"` // Metric statistics of job
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
}
const (
@ -124,18 +117,19 @@ type JobStatistics struct {
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
ID int64 `json:"id" db:"id"` // The unique DB identifier of a tag
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
Type string `json:"type" db:"tag_type" example:"Debug"`
Name string `json:"name" db:"tag_name" example:"Testjob"`
Scope string `json:"scope" db:"tag_scope" example:"global"`
ID int64 `json:"id" db:"id"`
}
// Resource model
// @Description A resource used by a job
type Resource struct {
Hostname string `json:"hostname"` // Name of the host (= node)
HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids
Accelerators []string `json:"accelerators,omitempty"` // List of of accelerator device ids
Configuration string `json:"configuration,omitempty"` // The configuration options of the node
Hostname string `json:"hostname"`
Configuration string `json:"configuration,omitempty"`
HWThreads []int `json:"hwthreads,omitempty"`
Accelerators []string `json:"accelerators,omitempty"`
}
type JobState string

View File

@ -10,22 +10,31 @@ import (
"math"
"sort"
"unsafe"
"github.com/ClusterCockpit/cc-backend/internal/util"
)
type JobData map[string]map[MetricScope]*JobMetric
type ScopedJobStats map[string]map[MetricScope][]*ScopedStats
type JobMetric struct {
Unit Unit `json:"unit"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
Unit Unit `json:"unit"`
Series []Series `json:"series"`
Timestep int `json:"timestep"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *string `json:"id,omitempty"`
Statistics MetricStatistics `json:"statistics"`
Hostname string `json:"hostname"`
Data []Float `json:"data"`
Statistics MetricStatistics `json:"statistics"`
}
type ScopedStats struct {
Hostname string `json:"hostname"`
Id *string `json:"id,omitempty"`
Data *MetricStatistics `json:"data"`
}
type MetricStatistics struct {
@ -35,10 +44,11 @@ type MetricStatistics struct {
}
type StatsSeries struct {
Percentiles map[int][]Float `json:"percentiles,omitempty"`
Mean []Float `json:"mean"`
Median []Float `json:"median"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
type MetricScope string
@ -121,6 +131,7 @@ func (jd *JobData) Size() int {
if metric.StatisticsSeries != nil {
n += len(metric.StatisticsSeries.Max)
n += len(metric.StatisticsSeries.Mean)
n += len(metric.StatisticsSeries.Median)
n += len(metric.StatisticsSeries.Min)
}
@ -149,53 +160,74 @@ func (jm *JobMetric) AddStatisticsSeries() {
}
}
min, mean, max := make([]Float, n), make([]Float, n), make([]Float, n)
// mean := make([]Float, n)
min, median, max := make([]Float, n), make([]Float, n), make([]Float, n)
i := 0
for ; i < m; i++ {
smin, ssum, smax := math.MaxFloat32, 0.0, -math.MaxFloat32
seriesCount := len(jm.Series)
// ssum := 0.0
smin, smed, smax := math.MaxFloat32, make([]float64, seriesCount), -math.MaxFloat32
notnan := 0
for j := 0; j < len(jm.Series); j++ {
for j := 0; j < seriesCount; j++ {
x := float64(jm.Series[j].Data[i])
if math.IsNaN(x) {
continue
}
notnan += 1
ssum += x
// ssum += x
smed[j] = x
smin = math.Min(smin, x)
smax = math.Max(smax, x)
}
if notnan < 3 {
min[i] = NaN
mean[i] = NaN
// mean[i] = NaN
median[i] = NaN
max[i] = NaN
} else {
min[i] = Float(smin)
mean[i] = Float(ssum / float64(notnan))
// mean[i] = Float(ssum / float64(notnan))
max[i] = Float(smax)
medianRaw, err := util.Median(smed)
if err != nil {
median[i] = NaN
} else {
median[i] = Float(medianRaw)
}
}
}
for ; i < n; i++ {
min[i] = NaN
mean[i] = NaN
// mean[i] = NaN
median[i] = NaN
max[i] = NaN
}
if smooth {
for i := 2; i < len(mean)-2; i++ {
for i := 2; i < len(median)-2; i++ {
if min[i].IsNaN() {
continue
}
min[i] = (min[i-2] + min[i-1] + min[i] + min[i+1] + min[i+2]) / 5
max[i] = (max[i-2] + max[i-1] + max[i] + max[i+1] + max[i+2]) / 5
mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
// mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
// Reduce Median further
smoothRaw := []float64{float64(median[i-2]), float64(median[i-1]), float64(median[i]), float64(median[i+1]), float64(median[i+2])}
smoothMedian, err := util.Median(smoothRaw)
if err != nil {
median[i] = NaN
} else {
median[i] = Float(smoothMedian)
}
}
}
jm.StatisticsSeries = &StatsSeries{Mean: mean, Min: min, Max: max}
jm.StatisticsSeries = &StatsSeries{Median: median, Min: min, Max: max} // Mean: mean
}
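The new median series leans on util.Median, which is assumed here to return the median of a float64 slice and an error for empty input; a self-contained sketch of the five-point smoothing window used above:
package main
import (
"fmt"
"sort"
)
// median mimics what util.Median is assumed to do: the middle value of a
// sorted copy, or the mean of the two middle values for even-length input.
func median(vals []float64) (float64, error) {
if len(vals) == 0 {
return 0, fmt.Errorf("median: empty input")
}
s := append([]float64(nil), vals...)
sort.Float64s(s)
n := len(s)
if n%2 == 1 {
return s[n/2], nil
}
return (s[n/2-1] + s[n/2]) / 2, nil
}
func main() {
// A five-point window around index i, as in the smoothing loop above.
m, _ := median([]float64{1, 9, 2, 8, 3})
fmt.Println(m) // 3: outliers barely move the median, unlike the old mean
}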
func (jd *JobData) AddNodeScope(metric string) bool {
@ -204,7 +236,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {
return false
}
var maxScope MetricScope = MetricScopeInvalid
maxScope := MetricScopeInvalid
for scope := range scopes {
maxScope = maxScope.Max(scope)
}
@ -266,6 +298,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
return true
}
func (jd *JobData) RoundMetricStats() {
// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
for _, scopes := range *jd {
for _, jm := range scopes {
for index := range jm.Series {
jm.Series[index].Statistics = MetricStatistics{
Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
}
}
}
}
}
func (jm *JobMetric) AddPercentiles(ps []int) bool {
if jm.StatisticsSeries == nil {
jm.AddStatisticsSeries()

View File

@ -1,284 +1,339 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://cluster.schema.json",
"title": "HPC cluster description",
"description": "Meta data information of a HPC cluster",
"type": "object",
"properties": {
"name": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"metricConfig": {
"description": "Metric specifications",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Metric name",
"type": "string"
},
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"scope": {
"description": "Native measurement resolution",
"type": "string"
},
"timestep": {
"description": "Frequency of timeseries points",
"type": "integer"
},
"aggregation": {
"description": "How the metric is aggregated",
"type": "string",
"enum": [
"sum",
"avg"
]
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"peak": {
"type": "number"
},
"normal": {
"type": "number"
},
"caution": {
"type": "number"
},
"alert": {
"type": "number"
},
"remove": {
"type": "boolean"
}
},
"required": [
"name"
]
}
}
},
"required": [
"name",
"unit",
"scope",
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
},
"subClusters": {
"description": "Array of cluster hardware partitions",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"processorType": {
"description": "Processor type",
"type": "string"
},
"socketsPerNode": {
"description": "Number of sockets per node",
"type": "integer"
},
"coresPerSocket": {
"description": "Number of cores per socket",
"type": "integer"
},
"threadsPerCore": {
"description": "Number of SMT threads per core",
"type": "integer"
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
"type": "string"
},
"topology": {
"description": "Node topology",
"type": "object",
"properties": {
"node": {
"description": "HwTread lists of node",
"type": "array",
"items": {
"type": "integer"
}
},
"socket": {
"description": "HwTread lists of sockets",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"memoryDomain": {
"description": "HwTread lists of memory domains",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"die": {
"description": "HwTread lists of dies",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"core": {
"description": "HwTread lists of cores",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator devices",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique device id"
},
"type": {
"type": "string",
"description": "The accelerator type",
"enum": [
"Nvidia GPU",
"AMD GPU",
"Intel GPU"
]
},
"model": {
"type": "string",
"description": "The accelerator model"
}
},
"required": [
"id",
"type",
"model"
]
}
}
},
"required": [
"node",
"socket",
"memoryDomain"
]
}
},
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",
"coresPerSocket",
"threadsPerCore",
"flopRateScalar",
"flopRateSimd",
"memoryBandwidth"
]
},
"minItems": 1
}
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://cluster.schema.json",
"title": "HPC cluster description",
"description": "Meta data information of a HPC cluster",
"type": "object",
"properties": {
"name": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"required": [
"name",
"metricConfig",
"subClusters"
]
"metricConfig": {
"description": "Metric specifications",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Metric name",
"type": "string"
},
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"scope": {
"description": "Native measurement resolution",
"type": "string",
"enum": [
"node",
"socket",
"memoryDomain",
"core",
"hwthread",
"accelerator"
]
},
"timestep": {
"description": "Frequency of timeseries points in seconds",
"type": "integer"
},
"aggregation": {
"description": "How the metric is aggregated",
"type": "string",
"enum": [
"sum",
"avg"
]
},
"footprint": {
"description": "Is it a footprint metric and what type",
"type": "string",
"enum": [
"avg",
"max",
"min"
]
},
"energy": {
"description": "Is it used to calculate job energy",
"type": "string",
"enum": [
"power",
"energy"
]
},
"lowerIsBetter": {
"description": "Is lower better.",
"type": "boolean"
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"footprint": {
"description": "Is it a footprint metric and what type. Overwrite global setting",
"type": "string",
"enum": [
"avg",
"max",
"min"
]
},
"energy": {
"description": "Is it used to calculate job energy. Overwrite global",
"type": "string",
"enum": [
"power",
"energy"
]
},
"lowerIsBetter": {
"description": "Is lower better. Overwrite global",
"type": "boolean"
},
"peak": {
"description": "The maximum possible metric value",
"type": "number"
},
"normal": {
"description": "A common metric value level",
"type": "number"
},
"caution": {
"description": "Metric value requires attention",
"type": "number"
},
"alert": {
"description": "Metric value requiring immediate attention",
"type": "number"
},
"remove": {
"description": "Remove this metric for this subcluster",
"type": "boolean"
}
},
"required": [
"name"
]
}
}
},
"required": [
"name",
"unit",
"scope",
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
},
"subClusters": {
"description": "Array of cluster hardware partitions",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"processorType": {
"description": "Processor type",
"type": "string"
},
"socketsPerNode": {
"description": "Number of sockets per node",
"type": "integer"
},
"coresPerSocket": {
"description": "Number of cores per socket",
"type": "integer"
},
"threadsPerCore": {
"description": "Number of SMT threads per core",
"type": "integer"
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
"type": "string"
},
"topology": {
"description": "Node topology",
"type": "object",
"properties": {
"node": {
"description": "HwTread lists of node",
"type": "array",
"items": {
"type": "integer"
}
},
"socket": {
"description": "HwTread lists of sockets",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"memoryDomain": {
"description": "HwTread lists of memory domains",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"die": {
"description": "HwTread lists of dies",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"core": {
"description": "HwTread lists of cores",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator devices",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique device id"
},
"type": {
"type": "string",
"description": "The accelerator type",
"enum": [
"Nvidia GPU",
"AMD GPU",
"Intel GPU"
]
},
"model": {
"type": "string",
"description": "The accelerator model"
}
},
"required": [
"id",
"type",
"model"
]
}
}
},
"required": [
"node",
"socket",
"memoryDomain"
]
}
},
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",
"coresPerSocket",
"threadsPerCore",
"flopRateScalar",
"flopRateSimd",
"memoryBandwidth"
]
},
"minItems": 1
}
},
"required": [
"name",
"metricConfig",
"subClusters"
]
}
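For orientation, a minimal cluster description that satisfies the required keys above (a sketch: all names and values are illustrative, and the unit objects assume the {prefix, base} shape from unit.schema.json):
const exampleClusterJSON = `{
  "name": "testcluster",
  "metricConfig": [
    {
      "name": "flops_any",
      "unit": { "prefix": "G", "base": "F/s" },
      "scope": "node",
      "timestep": 60,
      "aggregation": "avg",
      "peak": 9216,
      "normal": 1000,
      "caution": 200,
      "alert": 50
    }
  ],
  "subClusters": [
    {
      "name": "main",
      "nodes": "node[001-128]",
      "processorType": "Intel Icelake",
      "socketsPerNode": 2,
      "coresPerSocket": 36,
      "threadsPerCore": 1,
      "flopRateScalar": { "unit": { "prefix": "G", "base": "F/s" }, "value": 432 },
      "flopRateSimd": { "unit": { "prefix": "G", "base": "F/s" }, "value": 9216 },
      "memoryBandwidth": { "unit": { "prefix": "G", "base": "B/s" }, "value": 350 },
      "topology": {
        "node": [0, 1],
        "socket": [[0], [1]],
        "memoryDomain": [[0], [1]]
      }
    }
  ]
}`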

View File

@ -1,433 +1,498 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://config.schema.json",
"title": "cc-backend configuration file schema",
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://config.schema.json",
"title": "cc-backend configuration file schema",
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
"type": "string"
}
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db-driver": {
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
"type": "string",
"enum": [
"sqlite3",
"mysql"
]
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"type": "string"
},
"archive": {
"description": "Configuration keys for job-archive",
"type": "object",
"properties": {
"kind": {
"description": "Backend type for job-archive",
"type": "string",
"enum": [
"file",
"s3"
]
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
"path": {
"description": "Path to job archive for file backend",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
"compression": {
"description": "Setup automatic compression for jobs older than number of days",
"type": "integer"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db-driver": {
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
"type": "string",
"enum": [
"sqlite3",
"mysql"
]
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"type": "string"
},
"job-archive": {
"description": "Configuration keys for job-archive",
"type": "object",
"properties": {
"kind": {
"description": "Backend type for job-archive",
"type": "string",
"enum": [
"file",
"s3"
]
},
"path": {
"description": "Path to job archive for file backend",
"type": "string"
},
"compression": {
"description": "Setup automatic compression for jobs older than number of days",
"type": "integer"
},
"retention": {
"description": "Configuration keys for retention",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy",
"type": "string",
"enum": [
"none",
"delete",
"move"
]
},
"includeDB": {
"description": "Also remove jobs from database",
"type": "boolean"
},
"age": {
"description": "Act on jobs with startTime older than age (in days)",
"type": "integer"
},
"location": {
"description": "The target directory for retention. Only applicable for retention move.",
"type": "string"
}
},
"required": [
"policy"
]
}
"retention": {
"description": "Configuration keys for retention",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy",
"type": "string",
"enum": [
"none",
"delete",
"move"
]
},
"required": [
"kind"
]
"includeDB": {
"description": "Also remove jobs from database",
"type": "boolean"
},
"age": {
"description": "Act on jobs with startTime older than age (in days)",
"type": "integer"
},
"location": {
"description": "The target directory for retention. Only applicable for retention move.",
"type": "string"
}
},
"required": [
"policy"
]
}
},
"required": [
"kind"
]
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"type": "integer"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
},
"emission-constant": {
"description": ".",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
},
"enable-resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"trigger": {
"description": "Trigger next zoom level at less than this many visible datapoints.",
"type": "integer"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"resolutions": {
"description": "Array of resampling target resolutions, in seconds.",
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": [
"trigger",
"resolutions"
]
},
"jwts": {
"description": "For JWT token authentication.",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
"cookieName": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"jwts": {
"description": "For JWT token authentication.",
"validateUser": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": [
"max-age"
]
},
"oidc": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
},
"required": [
"provider"
]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
}
},
"required": [
"url",
"user_base",
"search_dn",
"user_bind",
"user_filter"
]
},
"clusters": {
"description": "Configuration for the clusters to be displayed.",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
}
"kind": {
"type": "string",
"enum": [
"influxdb",
"prometheus",
"cc-metric-store",
"test"
]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": [
"max-age"
"kind",
"url"
]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
}
},
"required": [
"url",
"user_base",
"search_dn",
"user_bind",
"user_filter"
]
},
"clusters": {
"description": "Configuration for the clusters to be displayed.",
"type": "array",
"items": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": [
"influxdb",
"prometheus",
"cc-metric-store",
"test"
]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": [
"kind",
"url"
]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"from",
"to"
]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"from",
"to"
]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": [
"from",
"to"
]
}
},
"required": [
"numNodes",
"duration",
"startTime"
]
}
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"name",
"metricDataRepository",
"filterRanges"
],
"minItems": 1
}
},
"ui-defaults": {
"description": "Default configuration for web UI",
"type": "object",
"properties": {
"plot_general_colorBackground": {
"description": "Color plot background according to job average threshold limits",
"type": "boolean"
},
"plot_general_lineWidth": {
"description": "Initial linewidth",
"from",
"to"
]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"plot_list_jobsPerPage": {
"description": "Jobs shown per page in job lists",
},
"to": {
"type": "integer"
}
},
"plot_view_plotsPerRow": {
"description": "Number of plots per row in single job view",
"type": "integer"
"required": [
"from",
"to"
]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"plot_view_showPolarplot": {
"description": "Option to toggle polar plot in single job view",
"type": "boolean"
},
"plot_view_showRoofline": {
"description": "Option to toggle roofline plot in single job view",
"type": "boolean"
},
"plot_view_showStatTable": {
"description": "Option to toggle the node statistic table in single job view",
"type": "boolean"
},
"system_view_selectedMetric": {
"description": "Initial metric shown in system view",
"type": "string"
},
"analysis_view_histogramMetrics": {
"description": "Metrics to show as job count histograms in analysis view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"analysis_view_scatterPlotMetrics": {
"description": "Initial scatter plto configuration in analysis view",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"minItems": 2,
"maxItems": 2
},
"minItems": 1
}
},
"job_view_nodestats_selectedMetrics": {
"description": "Initial metrics shown in node statistics table of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_polarPlotMetrics": {
"description": "Metrics shown in polar plot of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_selectedMetrics": {
"description": "",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_general_colorscheme": {
"description": "Initial color scheme",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_list_selectedMetrics": {
"description": "Initial metric plots shown in jobs lists",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
}
"required": [
"from",
"to"
]
}
},
"required": [
"plot_general_colorBackground",
"plot_general_lineWidth",
"plot_list_jobsPerPage",
"plot_view_plotsPerRow",
"plot_view_showPolarplot",
"plot_view_showRoofline",
"plot_view_showStatTable",
"system_view_selectedMetric",
"analysis_view_histogramMetrics",
"analysis_view_scatterPlotMetrics",
"job_view_nodestats_selectedMetrics",
"job_view_polarPlotMetrics",
"job_view_selectedMetrics",
"plot_general_colorscheme",
"plot_list_selectedMetrics"
"numNodes",
"duration",
"startTime"
]
}
}
},
"required": [
"name",
"metricDataRepository",
"filterRanges"
],
"minItems": 1
}
},
"required": [
"jwts",
"clusters"
]
"ui-defaults": {
"description": "Default configuration for web UI",
"type": "object",
"properties": {
"plot_general_colorBackground": {
"description": "Color plot background according to job average threshold limits",
"type": "boolean"
},
"plot_general_lineWidth": {
"description": "Initial linewidth",
"type": "integer"
},
"plot_list_jobsPerPage": {
"description": "Jobs shown per page in job lists",
"type": "integer"
},
"plot_view_plotsPerRow": {
"description": "Number of plots per row in single job view",
"type": "integer"
},
"plot_view_showPolarplot": {
"description": "Option to toggle polar plot in single job view",
"type": "boolean"
},
"plot_view_showRoofline": {
"description": "Option to toggle roofline plot in single job view",
"type": "boolean"
},
"plot_view_showStatTable": {
"description": "Option to toggle the node statistic table in single job view",
"type": "boolean"
},
"system_view_selectedMetric": {
"description": "Initial metric shown in system view",
"type": "string"
},
"job_view_showFootprint": {
"description": "Option to toggle footprint ui in single job view",
"type": "boolean"
},
"job_list_usePaging": {
"description": "Option to switch from continous scroll to paging",
"type": "boolean"
},
"analysis_view_histogramMetrics": {
"description": "Metrics to show as job count histograms in analysis view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"analysis_view_scatterPlotMetrics": {
"description": "Initial scatter plto configuration in analysis view",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"minItems": 2,
"maxItems": 2
},
"minItems": 1
}
},
"job_view_nodestats_selectedMetrics": {
"description": "Initial metrics shown in node statistics table of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_selectedMetrics": {
"description": "Initial metrics shown as plots in single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_general_colorscheme": {
"description": "Initial color scheme",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_list_selectedMetrics": {
"description": "Initial metric plots shown in jobs lists",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
}
},
"required": [
"plot_general_colorBackground",
"plot_general_lineWidth",
"plot_list_jobsPerPage",
"plot_view_plotsPerRow",
"plot_view_showPolarplot",
"plot_view_showRoofline",
"plot_view_showStatTable",
"system_view_selectedMetric",
"job_view_showFootprint",
"job_list_usePaging",
"analysis_view_histogramMetrics",
"analysis_view_scatterPlotMetrics",
"job_view_nodestats_selectedMetrics",
"job_view_selectedMetrics",
"plot_general_colorscheme",
"plot_list_selectedMetrics"
]
}
},
"required": [
"jwts",
"clusters",
"apiAllowedIPs"
]
}
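Similarly, a minimal configuration covering the three required keys jwts, clusters, and apiAllowedIPs (a sketch; addresses, names, and the token are illustrative placeholders):
const exampleConfigJSON = `{
  "addr": "localhost:8080",
  "apiAllowedIPs": ["*"],
  "jwts": { "max-age": "2000h" },
  "clusters": [
    {
      "name": "testcluster",
      "metricDataRepository": {
        "kind": "cc-metric-store",
        "url": "http://localhost:8082",
        "token": "eyJhbGciOi..."
      },
      "filterRanges": {
        "numNodes": { "from": 1, "to": 64 },
        "duration": { "from": 0, "to": 86400 },
        "startTime": { "from": "2023-01-01T00:00:00Z", "to": null }
      }
    }
  ]
}`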

Some files were not shown because too many files have changed in this diff.