Mirror of https://github.com/ClusterCockpit/cc-backend
(synced 2026-02-28 21:37:31 +01:00)

Compare commits: hotfix...optimize-c (393 commits)
[Commit table: 393 commits, captured by SHA only, from 23ce1722a9 through 4083de2a51. The author, date, and message columns of the table did not survive the page capture.]
.github/dependabot.yml (vendored, deleted: 15 lines)

@@ -1,15 +0,0 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
-
-version: 2
-updates:
-  - package-ecosystem: "gomod"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "npm"
-    directory: "/web/frontend"
-    schedule:
-      interval: "weekly"
.gitignore (vendored, 2 changed lines)

@@ -13,7 +13,7 @@
 /var/checkpoints*

 migrateTimestamps.pl
-test_ccms_write_api.sh
+test_ccms_*

 /web/frontend/public/build
 /web/frontend/node_modules
AGENTS.md (new file, 26 lines)

@@ -0,0 +1,26 @@
+# ClusterCockpit Backend - Agent Guidelines
+
+## Build/Test Commands
+
+- Build: `make` or `go build ./cmd/cc-backend`
+- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`)
+- Run single test: `go test -run TestName ./path/to/package`
+- Run single test file: `go test ./path/to/package -run TestName`
+- Frontend build: `cd web/frontend && npm install && npm run build`
+- Generate GraphQL: `make graphql` (uses gqlgen)
+- Generate Swagger: `make swagger` (uses swaggo/swag)
+
+## Code Style
+
+- **Formatting**: Use `gofumpt` for all Go files (strict requirement)
+- **Copyright header**: All files must include copyright header (see existing files)
+- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration
+- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines)
+- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`)
+- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package
+- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`)
+- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName`
+- **Comments**: Document all exported functions/types with godoc-style comments
+- **Structs**: Document fields with inline comments, especially for complex configurations
+- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses
+- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding
CLAUDE.md (new file, 306 lines)

@@ -0,0 +1,306 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with
+code in this repository.
+
+## Project Overview
+
+ClusterCockpit is a job-specific performance monitoring framework for HPC
+clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a
+Svelte-based frontend, and manages job archives and metric data from various
+time-series databases.
+
+## Build and Development Commands
+
+### Building
+
+```bash
+# Build everything (frontend + backend)
+make
+
+# Build only the frontend
+make frontend
+
+# Build only the backend (requires frontend to be built first)
+go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
+```
+
+### Testing
+
+```bash
+# Run all tests
+make test
+
+# Run tests with verbose output
+go test -v ./...
+
+# Run tests for a specific package
+go test ./internal/repository
+```
+
+### Code Generation
+
+```bash
+# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls)
+make graphql
+
+# Regenerate Swagger/OpenAPI docs (after modifying API comments)
+make swagger
+```
+
+### Frontend Development
+
+```bash
+cd web/frontend
+
+# Install dependencies
+npm install
+
+# Build for production
+npm run build
+
+# Development mode with watch
+npm run dev
+```
+
+### Running
+
+```bash
+# Initialize database and create admin user
+./cc-backend -init-db -add-user demo:admin:demo
+
+# Start server in development mode (enables GraphQL Playground and Swagger UI)
+./cc-backend -server -dev -loglevel info
+
+# Start demo with sample data
+./startDemo.sh
+```
+
+## Architecture
+
+### Backend Structure
+
+The backend follows a layered architecture with clear separation of concerns:
+
+- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems
+- **internal/repository**: Data access layer using repository pattern
+  - Abstracts database operations (SQLite3 only)
+  - Implements LRU caching for performance
+  - Provides repositories for Job, User, Node, and Tag entities
+  - Transaction support for batch operations
+- **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
+- **internal/graph**: GraphQL API (uses gqlgen)
+  - Schema in `api/schema.graphqls`
+  - Generated code in `internal/graph/generated/`
+  - Resolvers in `internal/graph/schema.resolvers.go`
+- **internal/auth**: Authentication layer
+  - Supports local accounts, LDAP, OIDC, and JWT tokens
+  - Implements rate limiting for login attempts
+- **pkg/metricstore**: Metric store with data loading API
+  - In-memory metric storage with checkpointing
+  - Query API for loading job metric data
+- **internal/archiver**: Job archiving to file-based archive
+- **internal/api/nats.go**: NATS-based API for job and node operations
+  - Subscribes to NATS subjects for job events (start/stop)
+  - Handles node state updates via NATS
+  - Uses InfluxDB line protocol message format
+- **pkg/archive**: Job archive backend implementations
+  - File system backend (default)
+  - S3 backend
+  - SQLite backend (experimental)
+  - **parquet** sub-package: Parquet format support (schema, reader, writer, conversion)
+- **internal/metricstoreclient**: Client for cc-metric-store queries
+
+### Frontend Structure
+
+- **web/frontend**: Svelte 5 application
+  - Uses Rollup for building
+  - Components organized by feature (analysis, job, user, etc.)
+  - GraphQL client using @urql/svelte
+  - Bootstrap 5 + SvelteStrap for UI
+  - uPlot for time-series visualization
+- **web/templates**: Server-side Go templates
+
+### Key Concepts
+
+**Job Archive**: Completed jobs are stored in a file-based archive following the
+[ClusterCockpit job-archive
+specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).
+Each job has a `meta.json` file with metadata and metric data files.
+
+**Metric Data Repositories**: Time-series metric data is stored separately from
+job metadata. The system supports multiple backends (cc-metric-store is
+recommended). Configuration is per-cluster in `config.json`.
+
+**Authentication Flow**:
+
+1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT)
+2. Each authenticator's `CanLogin` method is called to determine if it should handle the request
+3. The first authenticator that returns true performs the actual `Login`
+4. JWT tokens are used for API authentication
+
+**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are
+applied automatically on startup. Version tracking in `version` table.
+
+**Scopes**: Metrics can be collected at different scopes:
+
+- Node scope (always available)
+- Core scope (for jobs with ≤8 nodes)
+- Accelerator scope (for GPU/accelerator metrics)
+
+## Configuration
+
+- **config.json**: Main configuration (clusters, metric repositories, archive settings)
+  - `main.apiSubjects`: NATS subject configuration (optional)
+    - `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event")
+    - `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state")
+  - `nats`: NATS client connection configuration (optional)
+    - `address`: NATS server address (e.g., "nats://localhost:4222")
+    - `username`: Authentication username (optional)
+    - `password`: Authentication password (optional)
+    - `creds-file-path`: Path to NATS credentials file (optional)
+- **.env**: Environment variables (secrets like JWT keys)
+  - Copy from `configs/env-template.txt`
+  - NEVER commit this file
+- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config)
+
+## Database
+
+- Default: SQLite 3 (`./var/job.db`)
+- Connection managed by `internal/repository`
+- Schema version in `internal/repository/migration.go`
+
+## Code Generation
+
+**GraphQL** (gqlgen):
+
+- Schema: `api/schema.graphqls`
+- Config: `gqlgen.yml`
+- Generated code: `internal/graph/generated/`
+- Custom resolvers: `internal/graph/schema.resolvers.go`
+- Run `make graphql` after schema changes
+
+**Swagger/OpenAPI**:
+
+- Annotations in `internal/api/*.go`
+- Generated docs: `internal/api/docs.go`, `api/swagger.yaml`
+- Run `make swagger` after API changes
+
+## Testing Conventions
+
+- Test files use `_test.go` suffix
+- Test data in `testdata/` subdirectories
+- Repository tests use in-memory SQLite
+- API tests use httptest
+
+## Common Workflows
+
+### Adding a new GraphQL field
+
+1. Edit schema in `api/schema.graphqls`
+2. Run `make graphql`
+3. Implement resolver in `internal/graph/schema.resolvers.go`
+
+### Adding a new REST endpoint
+
+1. Add handler in `internal/api/*.go`
+2. Add route in `internal/api/rest.go`
+3. Add Swagger annotations
+4. Run `make swagger`
+
+### Adding a new metric data backend
+
+1. Implement metric loading functions in `pkg/metricstore/query.go`
+2. Add cluster configuration to metric store initialization
+3. Update config.json schema documentation
+
+### Modifying database schema
+
+1. Create new migration in `internal/repository/migrations/sqlite3/`
+2. Increment `repository.Version`
+3. Test with fresh database and existing database
+
+## NATS API
+
+The backend supports a NATS-based API as an alternative to the REST API for job and node operations.
+
+### Setup
+
+1. Configure NATS client connection in `config.json`:
+
+```json
+{
+  "nats": {
+    "address": "nats://localhost:4222",
+    "username": "user",
+    "password": "pass"
+  }
+}
+```
+
+2. Configure API subjects in `config.json` under `main`:
+
+```json
+{
+  "main": {
+    "apiSubjects": {
+      "subjectJobEvent": "cc.job.event",
+      "subjectNodeState": "cc.node.state"
+    }
+  }
+}
+```
+
+### Message Format
+
+Messages use **InfluxDB line protocol** format with the following structure:
+
+#### Job Events
+
+**Start Job:**
+
+```
+job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
+```
+
+**Stop Job:**
+
+```
+job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
+```
+
+**Tags:**
+
+- `function`: Either `start_job` or `stop_job`
+
+**Fields:**
+
+- `event`: JSON payload containing job data (see REST API documentation for schema)
+
+#### Node State Updates
+
+```json
+{
+  "cluster": "testcluster",
+  "nodes": [
+    {
+      "hostname": "node001",
+      "states": ["allocated"],
+      "cpusAllocated": 8,
+      "memoryAllocated": 16384,
+      "gpusAllocated": 0,
+      "jobsRunning": 1
+    }
+  ]
+}
+```
+
+### Implementation Notes
+
+- NATS API mirrors REST API functionality but uses messaging
+- Job start/stop events are processed asynchronously
+- Duplicate job detection is handled (same as REST API)
+- All validation rules from REST API apply
+- Messages are logged; no responses are sent back to publishers
+- If NATS client is unavailable, API subscriptions are skipped (logged as warning)
+
+## Dependencies
+
+- Go 1.24.0+ (check go.mod for exact version)
+- Node.js (for frontend builds)
+- SQLite 3 (only supported database)
+- Optional: NATS server for NATS API integration
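
As a concrete illustration of the job-event format documented above, here is a minimal sketch of a publisher, assuming the standard nats.go client (github.com/nats-io/nats.go) and the `cc.job.event` subject from the configuration examples; the payload fields and the escaping step are illustrative only, not cc-backend code:

```go
// Minimal sketch: publish a start_job event in the line-protocol format
// described in CLAUDE.md above. Assumes a local NATS server and the
// "cc.job.event" subject from the sample configuration.
package main

import (
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	nc, err := nats.Connect("nats://localhost:4222")
	if err != nil {
		panic(err)
	}
	defer nc.Close()

	// JSON payload carried in the `event` field (fields illustrative).
	payload, _ := json.Marshal(map[string]any{
		"jobId": 123, "user": "alice", "cluster": "test",
	})
	// Line protocol requires double quotes inside string fields to be escaped.
	escaped := strings.ReplaceAll(string(payload), `"`, `\"`)

	// job,function=start_job event="..." <timestamp in ns>
	msg := fmt.Sprintf("job,function=start_job event=\"%s\" %d",
		escaped, time.Now().UnixNano())
	if err := nc.Publish("cc.job.event", []byte(msg)); err != nil {
		panic(err)
	}
}
```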
Makefile (10 changed lines)

@@ -1,6 +1,6 @@
 TARGET = ./cc-backend
 FRONTEND = ./web/frontend
-VERSION = 1.4.4
+VERSION = 1.5.0
 GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
 CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
 LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'

@@ -46,16 +46,16 @@ $(TARGET): $(SVELTE_TARGETS)

 frontend:
 	$(info ===> BUILD frontend)
-	cd web/frontend && npm install && npm run build
+	cd web/frontend && npm ci && npm run build

 swagger:
 	$(info ===> GENERATE swagger)
-	@go run github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
+	@go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api
 	@mv ./api/docs.go ./internal/api/docs.go

 graphql:
 	$(info ===> GENERATE graphql)
-	@go run github.com/99designs/gqlgen
+	@go tool github.com/99designs/gqlgen

 clean:
 	$(info ===> CLEAN)

@@ -84,4 +84,4 @@ $(VAR):

 $(SVELTE_TARGETS): $(SVELTE_SRC)
 	$(info ===> BUILD frontend)
-	cd web/frontend && npm install && npm run build
+	cd web/frontend && npm ci && npm run build
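
The `go run` to `go tool` change in the swagger and graphql recipes relies on the `tool` directives introduced in Go 1.24, which pin developer tools in `go.mod`. A sketch of the matching `go.mod` entries, assuming these are the two tools the module declares (they can be added with `go get -tool <module>`):

```
// go.mod fragment (Go 1.24+): declares the tools that `go tool <path>` runs.
// Added with, e.g., `go get -tool github.com/99designs/gqlgen`.
tool (
	github.com/99designs/gqlgen
	github.com/swaggo/swag/cmd/swag
)
```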
README.md (72 changed lines)

@@ -22,19 +22,23 @@ switching from PHP Symfony to a Golang based solution are explained
 ## Overview

 This is a Golang web backend for the ClusterCockpit job-specific performance
-monitoring framework. It provides a REST API for integrating ClusterCockpit with
-an HPC cluster batch system and external analysis scripts. Data exchange between
-the web front-end and the back-end is based on a GraphQL API. The web frontend
-is also served by the backend using [Svelte](https://svelte.dev/) components.
-Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using
+monitoring framework. It provides a REST API and an optional NATS-based messaging
+API for integrating ClusterCockpit with an HPC cluster batch system and external
+analysis scripts. Data exchange between the web front-end and the back-end is
+based on a GraphQL API. The web frontend is also served by the backend using
+[Svelte](https://svelte.dev/) components. Layout and styling are based on
+[Bootstrap 5](https://getbootstrap.com/) using
 [Bootstrap Icons](https://icons.getbootstrap.com/).

-The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by
-default. Optionally it can use a MySQL/MariaDB database server. While there are
-metric data backends for the InfluxDB and Prometheus time series databases, the
-only tested and supported setup is to use cc-metric-store as the metric data
-backend. Documentation on how to integrate ClusterCockpit with other time series
-databases will be added in the future.
+The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database.
+While there are metric data backends for the InfluxDB and Prometheus time series
+databases, the only tested and supported setup is to use cc-metric-store as the
+metric data backend. Documentation on how to integrate ClusterCockpit with other
+time series databases will be added in the future.
+
+For real-time integration with HPC systems, the backend can subscribe to
+[NATS](https://nats.io/) subjects to receive job start/stop events and node
+state updates, providing an alternative to REST API polling.

 Completed batch jobs are stored in a file-based job archive according to
 [this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive).

@@ -131,27 +135,59 @@ ln -s <your-existing-job-archive> ./var/job-archive

 ## Project file structure

+- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github)
+  GitHub Actions workflows and dependabot configuration for CI/CD.
 - [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api)
   contains the API schema files for the REST and GraphQL APIs. The REST API is
   documented in the OpenAPI 3.0 format in
-  [./api/openapi.yaml](./api/openapi.yaml).
+  [./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in
+  [./api/schema.graphqls](./api/schema.graphqls).
 - [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend)
-  contains `main.go` for the main application.
+  contains the main application entry point and CLI implementation.
 - [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs)
   contains documentation about configuration and command line options and required
-  environment variables. A sample configuration file is provided.
-- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs)
-  contains more in-depth documentation.
+  environment variables. Sample configuration files are provided.
 - [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init)
   contains an example of setting up systemd for production use.
 - [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal)
   contains library source code that is not intended for use by others.
+  - [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api)
+    REST API handlers and NATS integration
+  - [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver)
+    Job archiving functionality
+  - [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth)
+    Authentication (local, LDAP, OIDC) and JWT token handling
+  - [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config)
+    Configuration management and validation
+  - [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph)
+    GraphQL schema and resolvers
+  - [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer)
+    Job data import and database initialization
+  - [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch)
+    Dispatches metric data loading to appropriate backends
+  - [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository)
+    Database repository layer for jobs and metadata
+  - [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig)
+    HTTP router configuration and middleware
+  - [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger)
+    Job classification and application detection
+  - [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
+    Background task management and scheduled jobs
+  - [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient)
+    Client for cc-metric-store queries
 - [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg)
   contains Go packages that can be used by other projects.
+  - [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
+    Job archive backend implementations (filesystem, S3, SQLite)
+  - [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore)
+    In-memory metric data store with checkpointing and metric loading
 - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
   Additional command line helper tools.
   - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
-    Commands for getting infos about and existing job archive.
+    Commands for getting infos about an existing job archive, importing jobs
+    between archive backends, and converting archives between JSON and Parquet formats.
+  - [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
+    Tool for migrating job archives between formats.
   - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
     Tool to convert external pubkey for use in `cc-backend`.
   - [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair)

@@ -163,7 +199,7 @@ ln -s <your-existing-job-archive> ./var/job-archive
 - [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend)
   Svelte components and static assets for the frontend UI
 - [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates)
-  Server-side Go templates
+  Server-side Go templates, including monitoring views
 - [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml)
   Configures the behaviour and generation of
   [gqlgen](https://github.com/99designs/gqlgen).
ReleaseNotes.md (284 changed lines)

@@ -1,47 +1,277 @@
-# `cc-backend` version 1.4.4
+# `cc-backend` version 1.5.0

-Supports job archive version 2 and database version 8.
+Supports job archive version 3 and database version 10.

-This is a bug fix release of `cc-backend`, the API backend and frontend
+This is a feature release of `cc-backend`, the API backend and frontend
 implementation of ClusterCockpit.
 For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/).

 ## Breaking changes

-The option `apiAllowedIPs` is now a required configuration attribute in
-`config.json`. This option restricts access to the admin API.
-
-To retain the previous behavior that the API is per default accessible from
-everywhere set:
+### Configuration changes
+
+- **JSON attribute naming**: All JSON configuration attributes now use `kebab-case`
+  style consistently (e.g., `api-allowed-ips` instead of `apiAllowedIPs`).
+  Update your `config.json` accordingly.
+- **Removed `disable-archive` option**: This obsolete configuration option has been removed.
+- **Removed `clusters` config section**: The separate clusters configuration section
+  has been removed. Cluster information is now derived from the job archive.
+- **`apiAllowedIPs` is now optional**: If not specified, defaults to not
+  restricted.
+
+### Architecture changes
+
+- **Web framework replaced**: Migrated from `gorilla/mux` to `chi` as the HTTP
+  router. This should be transparent to users but affects how middleware and
+  routes are composed. A proper 404 handler is now in place.
+- **MetricStore moved**: The `metricstore` package has been moved from `internal/`
+  to `pkg/` as it is now part of the public API.
+- **MySQL/MariaDB support removed**: Only SQLite is now supported as the database backend.
+- **Archive to Cleanup renaming**: Archive-related functions have been refactored
+  and renamed to "Cleanup" for clarity.
+- **`minRunningFor` filter removed**: This undocumented filter has been removed
+  from the API and frontend.
+
+### Dependency changes
+
+- **cc-lib v2.5.1**: Switched to cc-lib version 2 with updated APIs (currently at v2.5.1)
+- **cclib NATS client**: Now using the cclib NATS client implementation
+- Removed obsolete `util.Float` usage from cclib
+
+## Major new features
+
+### NATS API Integration
+
+- **Real-time job events**: Subscribe to job start/stop events via NATS
+- **Node state updates**: Receive real-time node state changes via NATS
+- **Configurable subjects**: NATS API subjects are now configurable via `api-subjects`
+- **Deadlock fixes**: Improved NATS client stability and graceful shutdown
+
+### Public Dashboard
+
+- **Public-facing interface**: New public dashboard route for external users
+- **DoubleMetricPlot component**: New visualization component for comparing metrics
+- **Improved layout**: Reviewed and optimized dashboard layouts for better readability
+
+### Enhanced Node Management
+
+- **Node state tracking**: New node table in database with timestamp tracking
+- **Node state filtering**: Filter jobs by node state in systems view
+- **Node list enhancements**: Improved paging, filtering, and continuous scroll support
+- **Nodestate retention and archiving**: Node state data is now subject to configurable
+  retention policies and can be archived to Parquet format for long-term storage
+- **Faulty node metric tracking**: Faulty node state metric lists are persisted to the database
+
+### Health Monitoring
+
+- **Health status dashboard**: New dedicated "Health" tab in the status details view
+  showing per-node metric health across the cluster
+- **CCMS health check**: Support for querying health status of external
+  cc-metric-store (CCMS) instances via the API
+- **GraphQL health endpoints**: New GraphQL queries and resolvers for health data
+- **Cluster/subcluster filter**: Filter health status view by cluster or subcluster
+
+### Log Viewer
+
+- **Web-based log viewer**: New log viewer page in the admin interface for inspecting
+  backend log output directly from the browser without shell access
+- **Accessible from header**: Quick access link from the navigation header
+
+### MetricStore Improvements
+
+- **Memory tracking worker**: New worker for CCMS memory usage tracking
+- **Dynamic retention**: Support for job specific dynamic retention times
+- **Improved compression**: Transparent compression for job archive imports
+- **Parallel processing**: Parallelized Iter function in all archive backends
+
+### Job Tagging System
+
+- **Job tagger option**: Enable automatic job tagging via configuration flag
+- **Application detection**: Automatic detection of applications (MATLAB, GROMACS, etc.)
+- **Job classification**: Automatic detection of pathological jobs
+- **omit-tagged**: Option to exclude tagged jobs from retention/cleanup operations (`none`, `all`, or `user`)
+- **Admin UI trigger**: Taggers can be run on-demand from the admin web interface
+  without restarting the backend
+
+### Archive Backends
+
+- **Parquet archive format**: New Parquet file format for job archiving, providing
+  columnar storage with efficient compression for analytical workloads
+- **S3 backend**: Full support for S3-compatible object storage
+- **SQLite backend**: Full support for SQLite backend using blobs
+- **Performance improvements**: Fixed performance bugs in archive backends
+- **Better error handling**: Improved error messages and fallback handling
+- **Zstd compression**: Parquet writers use zstd compression for better
+  compression ratios compared to the previous snappy default
+- **Optimized sort order**: Job and nodestate Parquet files are sorted by
+  cluster, subcluster, and start time for efficient range queries
+
+### Unified Archive Retention and Format Conversion
+
+- **Uniform retention policy**: Job archive retention now supports both JSON and
+  Parquet as target formats under a single, consistent policy configuration
+- **Archive manager tool**: The `tools/archive-manager` utility now supports
+  format conversion between JSON and Parquet job archives
+- **Parquet reader**: Full Parquet archive reader implementation for reading back
+  archived job data
+
+## New features and improvements
+
+### Frontend
+
+- **Loading indicators**: Added loading indicators to status detail and job lists
+- **Job info layout**: Reviewed and improved job info row layout
+- **Metric selection**: Enhanced metric selection with drag-and-drop fixes
+- **Filter presets**: Move list filter preset to URL for easy sharing
+- **Job comparison**: Improved job comparison views and plots
+- **Subcluster reactivity**: Job list now reacts to subcluster filter changes
+- **Short jobs quick selection**: New "Short jobs" quick-filter button in job lists
+  replaces the removed undocumented `minRunningFor` filter
+- **Row plot cursor sync**: Cursor position is now synchronized across all metric
+  plots in a job list row for easier cross-metric comparison
+- **Disabled metrics handling**: Improved handling and display of disabled metrics
+  across job view, node view, and list rows
+- **"Not configured" info cards**: Informational cards shown when optional features
+  are not yet configured
+- **Frontend dependencies**: Bumped frontend dependencies to latest versions
+- **Svelte 5 compatibility**: Fixed Svelte state warnings and compatibility issues
+
+### Backend
+
+- **Progress bars**: Import function now shows progress during long operations
+- **Better logging**: Improved logging with appropriate log levels throughout
+- **Graceful shutdown**: Fixed shutdown timeout bugs and hanging issues
+- **Configuration defaults**: Sensible defaults for most configuration options
+- **Documentation**: Extensive documentation improvements across packages
+- **Server flag in systemd unit**: Example systemd unit now includes the `-server` flag
+
+### Security
+
+- **LDAP security hardening**: Improved input validation, connection handling, and
+  error reporting in the LDAP authenticator
+- **OIDC security hardening**: Stricter token validation and improved error handling
+  in the OIDC authenticator
+- **Auth schema extensions**: Additional schema fields for improved auth configuration
+
+### API improvements
+
+- **Role-based metric visibility**: Metrics can now have role-based access control
+- **Job exclusivity filter**: New filter for exclusive vs. shared jobs
+- **Improved error messages**: Better error messages and documentation in REST API
+- **GraphQL enhancements**: Improved GraphQL queries and resolvers
+- **Stop job lookup order**: Reversed lookup order in stop job requests for
+  more reliable job matching (cluster+jobId first, then jobId alone)
+
+### Performance
+
+- **Database indices**: Optimized SQLite indices for better query performance
+- **Job cache**: Introduced caching table for faster job inserts
+- **Parallel imports**: Archive imports now run in parallel where possible
+- **External tool integration**: Optimized use of external tools (fd) for better performance
+- **Node repository queries**: Reviewed and optimized node repository SQL queries
+- **Buffer pool**: Resized and pooled internal buffers for better memory reuse
+
+### Developer experience
+
+- **AI agent guidelines**: Added documentation for AI coding agents (AGENTS.md, CLAUDE.md)
+- **Example API payloads**: Added example JSON API payloads for testing
+- **Unit tests**: Added more unit tests for NATS API, node repository, and other components
+- **Test improvements**: Better test coverage; test DB is now copied before unit tests
+  to avoid state pollution between test runs
+- **Parquet writer tests**: Comprehensive tests for Parquet archive writing and conversion
+
+## Bug fixes
+
+- Fixed nodelist paging issues
+- Fixed metric select drag and drop functionality
+- Fixed render race conditions in nodeList
+- Fixed tag count grouping including type
+- Fixed wrong metricstore schema (missing comma)
+- Fixed configuration issues causing shutdown hangs
+- Fixed deadlock when NATS is not configured
+- Fixed archive backend performance bugs
+- Fixed continuous scroll buildup on refresh
+- Improved footprint calculation logic
+- Fixed polar plot data query decoupling
+- Fixed missing resolution parameter handling
+- Fixed node table initialization fallback
+- Fixed reactivity key placement in nodeList
+- Fixed nodeList resolver data handling and increased nodestate filter cutoff
+- Fixed job always being transferred to main job table before archiving
+- Fixed AppTagger error handling and logging
+- Fixed log endpoint formatting and correctness
+- Fixed automatic refresh in metric status tab
+- Fixed NULL value handling in `health_state` and `health_metrics` columns
+- Fixed bugs related to `job_cache` IDs being used in the main job table
+- Fixed SyncJobs bug causing start job hooks to be called with wrong (cache) IDs
+- Fixed 404 handler route for sub-routers
+
+## Configuration changes
+
+### New configuration options

 ```json
-"apiAllowedIPs": [
-    "*"
-]
+{
+  "main": {
+    "enable-job-taggers": true,
+    "resampling": {
+      "minimum-points": 600,
+      "trigger": 180,
+      "resolutions": [240, 60]
+    },
+    "api-subjects": {
+      "subject-job-event": "cc.job.event",
+      "subject-node-state": "cc.node.state"
+    }
+  },
+  "nats": {
+    "address": "nats://0.0.0.0:4222",
+    "username": "root",
+    "password": "root"
+  },
+  "cron": {
+    "commit-job-worker": "1m",
+    "duration-worker": "5m",
+    "footprint-worker": "10m"
+  },
+  "metric-store": {
+    "cleanup": {
+      "mode": "archive",
+      "interval": "48h",
+      "directory": "./var/archive"
+    }
+  },
+  "archive": {
+    "retention": {
+      "policy": "delete",
+      "age": "6months",
+      "target-format": "parquet"
+    }
+  },
+  "nodestate": {
+    "retention": {
+      "policy": "archive",
+      "age": "30d",
+      "archive-path": "./var/nodestate-archive"
+    }
+  }
+}
 ```

-## Breaking changes for minor release 1.4.x
+## Migration notes

-- You need to perform a database migration. Depending on your database size the
-  migration might require several hours!
-- You need to adapt the `cluster.json` configuration files in the job-archive,
-  add new required attributes to the metric list and after that edit
-  `./job-archive/version.txt` to version 2. Only metrics that have the footprint
-  attribute set can be filtered and show up in the footprint UI and polar plot.
-- Continuous scrolling is default now in all job lists. You can change this back
-  to paging globally, also every user can configure to use paging or continuous
-  scrolling individually.
-- Tags have a scope now. Existing tags will get global scope in the database
-  migration.
-
-## New features
-
-- Enable to delete tags from the web interface
+- Review and update your `config.json` to use kebab-case attribute names
+- If using NATS, configure the new `nats` and `api-subjects` sections
+- If using S3 archive backend, configure the new `archive` section options
+- Test the new public dashboard at `/public` route
+- Review cron worker configuration if you need different frequencies
+- If using the archive retention feature, configure the `target-format` option
+  to choose between `json` (default) and `parquet` output formats
+- Consider enabling nodestate retention if you track node states over time

 ## Known issues

 - Currently energy footprint metrics of type energy are ignored for calculating
   total energy.
 - Resampling for running jobs only works with cc-metric-store
 - With energy footprint metrics of type power the unit is ignored and it is
   assumed the metric has the unit Watt.
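
To make the kebab-case rename called out above concrete, here is a `config.json` fragment before and after migration; the attribute names come from the release notes, while the surrounding structure is illustrative:

```diff
 "main": {
-  "apiAllowedIPs": ["*"],
-  "resampling": { "minimumPoints": 600 }
+  "api-allowed-ips": ["*"],
+  "resampling": { "minimum-points": 600 }
 }
```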
api/schema.graphqls

@@ -19,6 +19,7 @@ type Node {
   schedulerState: SchedulerState!
   healthState: MonitoringState!
   metaData: Any
+  healthData: Any
 }

 type NodeStates {

@@ -164,6 +165,13 @@ type JobMetricWithName {
   metric: JobMetric!
 }

+type ClusterMetricWithName {
+  name: String!
+  unit: Unit
+  timestep: Int!
+  data: [NullableFloat!]!
+}
+
 type JobMetric {
   unit: Unit
   timestep: Int!

@@ -267,6 +275,11 @@ type NodeMetrics {
   metrics: [JobMetricWithName!]!
 }

+type ClusterMetrics {
+  nodeCount: Int!
+  metrics: [ClusterMetricWithName!]!
+}
+
 type NodesResultList {
   items: [NodeMetrics!]!
   offset: Int

@@ -316,6 +329,7 @@ type Query {
   ## Node Queries New
   node(id: ID!): Node
   nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
+  nodesWithMeta(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
   nodeStates(filter: [NodeFilter!]): [NodeStates!]!
   nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]!

@@ -385,6 +399,13 @@ type Query {
     page: PageRequest
     resolution: Int
   ): NodesResultList!
+
+  clusterMetrics(
+    cluster: String!
+    metrics: [String!]
+    from: Time!
+    to: Time!
+  ): ClusterMetrics!
 }

 type Mutation {

@@ -410,7 +431,7 @@ type TimeRangeOutput {
 input NodeFilter {
   hostname: StringInput
   cluster: StringInput
-  subcluster: StringInput
+  subCluster: StringInput
   schedulerState: SchedulerState
   healthState: MonitoringState
   timeStart: Int

@@ -425,6 +446,7 @@ input JobFilter {
   project: StringInput
   jobName: StringInput
   cluster: StringInput
+  subCluster: StringInput
   partition: StringInput
   duration: IntRange
   energy: FloatRange

@@ -439,6 +461,7 @@ input JobFilter {
   state: [JobState!]
   metricStats: [MetricStatItem!]
   shared: String
+  schedule: String
   node: StringInput
 }
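
For reference, a query exercising the new `clusterMetrics` root field could look like this; the selection set is derived from the `ClusterMetrics` and `ClusterMetricWithName` types added above, while the cluster name, metric names, and time range are illustrative:

```graphql
query {
  clusterMetrics(
    cluster: "testcluster"
    metrics: ["cpu_load", "mem_used"]
    from: "2026-01-01T00:00:00Z"
    to: "2026-01-02T00:00:00Z"
  ) {
    nodeCount
    metrics {
      name
      timestep
      data
    }
  }
}
```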
api/swagger.json (1235 changed lines): file diff suppressed because it is too large.

api/swagger.yaml (755 changed lines): file diff suppressed because it is too large.
cmd/cc-backend/cli.go

@@ -33,6 +33,6 @@ func cliInit() {
 	flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: <username>")
 	flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`")
 	flag.StringVar(&flagImportJob, "import-job", "", "Import a job. Argument format: `<path-to-meta.json>:<path-to-data.json>,...`")
-	flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info (default), warn, err, crit]`")
+	flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`")
 	flag.Parse()
 }
cmd/cc-backend/init.go

@@ -12,11 +12,10 @@ import (
 	"encoding/json"
 	"os"

 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
 	"github.com/ClusterCockpit/cc-backend/pkg/archive"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/util"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/util"
 )

 const envString = `

@@ -35,20 +34,20 @@ const configString = `
 		"addr": "127.0.0.1:8080",
 		"short-running-jobs-duration": 300,
 		"resampling": {
-			"minimumPoints": 600,
-			"trigger": 180,
+			"minimum-points": 600,
+			"trigger": 300,
 			"resolutions": [
 				240,
 				60
 			]
 		},
-		"apiAllowedIPs": [
+		"api-allowed-ips": [
 			"*"
 		],
 		"emission-constant": 317
 	},
 	"cron": {
-		"commit-job-worker": "2m",
+		"commit-job-worker": "1m",
 		"duration-worker": "5m",
 		"footprint-worker": "10m"
 	},

@@ -60,31 +59,7 @@ const configString = `
 	"jwts": {
 		"max-age": "2000h"
 	}
 },
-"clusters": [
-	{
-		"name": "name",
-		"metricDataRepository": {
-			"kind": "cc-metric-store",
-			"url": "http://localhost:8082",
-			"token": ""
-		},
-		"filterRanges": {
-			"numNodes": {
-				"from": 1,
-				"to": 64
-			},
-			"duration": {
-				"from": 0,
-				"to": 86400
-			},
-			"startTime": {
-				"from": "2023-01-01T00:00:00Z",
-				"to": null
-			}
-		}
-	}
-]
 }
 }
 `

@@ -105,15 +80,15 @@ func initEnv() {
 		cclog.Abortf("Could not create default ./var folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
 	}

-	err := repository.MigrateDB("sqlite3", "./var/job.db")
+	err := repository.MigrateDB("./var/job.db")
 	if err != nil {
-		cclog.Abortf("Could not initialize default sqlite3 database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
+		cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error())
 	}
 	if err := os.Mkdir("var/job-archive", 0o777); err != nil {
 		cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error())
 	}
 	archiveCfg := "{\"kind\": \"file\",\"path\": \"./var/job-archive\"}"
-	if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
+	if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
 		cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error())
 	}
 }
@@ -24,22 +24,22 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/tagger"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/taskmanager"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
"github.com/ClusterCockpit/cc-backend/web"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/runtimeEnv"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/nats"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/runtime"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||
"github.com/google/gops/agent"
|
||||
"github.com/joho/godotenv"
|
||||
|
||||
_ "github.com/go-sql-driver/mysql"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
@@ -103,46 +103,41 @@ func initConfiguration() error {
|
||||
return fmt.Errorf("main configuration must be present")
|
||||
}
|
||||
|
||||
clustercfg := ccconf.GetPackageConfig("clusters")
|
||||
if clustercfg == nil {
|
||||
return fmt.Errorf("cluster configuration must be present")
|
||||
}
|
||||
|
||||
config.Init(cfg, clustercfg)
|
||||
config.Init(cfg)
|
||||
return nil
|
||||
}
|
||||
|
||||
func initDatabase() error {
|
||||
repository.Connect(config.Keys.DBDriver, config.Keys.DB)
|
||||
repository.Connect(config.Keys.DB)
|
||||
return nil
|
||||
}
|
||||
|
||||
func handleDatabaseCommands() error {
|
||||
if flagMigrateDB {
|
||||
err := repository.MigrateDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
err := repository.MigrateDB(config.Keys.DB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("migrating database to version %d: %w", repository.Version, err)
|
||||
}
|
||||
cclog.Exitf("MigrateDB Success: Migrated '%s' database at location '%s' to version %d.\n",
|
||||
config.Keys.DBDriver, config.Keys.DB, repository.Version)
|
||||
cclog.Exitf("MigrateDB Success: Migrated SQLite database at '%s' to version %d.\n",
|
||||
config.Keys.DB, repository.Version)
|
||||
}
|
||||
|
||||
if flagRevertDB {
|
||||
err := repository.RevertDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
err := repository.RevertDB(config.Keys.DB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("reverting database to version %d: %w", repository.Version-1, err)
|
||||
}
|
||||
cclog.Exitf("RevertDB Success: Reverted '%s' database at location '%s' to version %d.\n",
|
||||
config.Keys.DBDriver, config.Keys.DB, repository.Version-1)
|
||||
cclog.Exitf("RevertDB Success: Reverted SQLite database at '%s' to version %d.\n",
|
||||
config.Keys.DB, repository.Version-1)
|
||||
}
|
||||
|
||||
if flagForceDB {
|
||||
err := repository.ForceDB(config.Keys.DBDriver, config.Keys.DB)
|
||||
err := repository.ForceDB(config.Keys.DB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("forcing database to version %d: %w", repository.Version, err)
|
||||
}
|
||||
cclog.Exitf("ForceDB Success: Forced '%s' database at location '%s' to version %d.\n",
|
||||
config.Keys.DBDriver, config.Keys.DB, repository.Version)
|
||||
cclog.Exitf("ForceDB Success: Forced SQLite database at '%s' to version %d.\n",
|
||||
config.Keys.DB, repository.Version)
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -253,7 +248,7 @@ func generateJWT(authHandle *auth.Authentication, username string) error {
|
||||
return fmt.Errorf("getting user '%s': %w", username, err)
|
||||
}
|
||||
|
||||
if !user.HasRole(schema.RoleApi) {
|
||||
if !user.HasRole(schema.RoleAPI) {
|
||||
cclog.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username)
|
||||
}
|
||||
|
||||
@@ -262,25 +257,28 @@ func generateJWT(authHandle *auth.Authentication, username string) error {
 		return fmt.Errorf("generating JWT for user '%s': %w", user.Username, err)
 	}

-	cclog.Infof("JWT: Successfully generated JWT for user '%s': %s", user.Username, jwt)
+	cclog.Printf("JWT: Successfully generated JWT for user '%s': %s\n", user.Username, jwt)
 	return nil
 }

 func initSubsystems() error {
+	// Initialize nats client
+	natsConfig := ccconf.GetPackageConfig("nats")
+	if err := nats.Init(natsConfig); err != nil {
+		cclog.Warnf("initializing (optional) nats client: %s", err.Error())
+	}
+	nats.Connect()
+
 	// Initialize job archive
 	archiveCfg := ccconf.GetPackageConfig("archive")
 	if archiveCfg == nil {
 		cclog.Debug("Archive configuration not found, using default archive configuration")
 		archiveCfg = json.RawMessage(defaultArchiveConfig)
 	}
-	if err := archive.Init(archiveCfg, config.Keys.DisableArchive); err != nil {
+	if err := archive.Init(archiveCfg); err != nil {
 		return fmt.Errorf("initializing archive: %w", err)
 	}

 	// Initialize metricdata
 	if err := metricdata.Init(); err != nil {
 		return fmt.Errorf("initializing metricdata repository: %w", err)
 	}

 	// Handle database re-initialization
 	if flagReinitDB {
 		if err := importer.InitDB(); err != nil {
@@ -304,6 +302,8 @@ func initSubsystems() error {

 	// Apply tags if requested
 	if flagApplyTags {
+		tagger.Init()
+
 		if err := tagger.RunTaggers(); err != nil {
 			return fmt.Errorf("running job taggers: %w", err)
 		}
@@ -315,13 +315,38 @@ func initSubsystems() error {
 func runServer(ctx context.Context) error {
 	var wg sync.WaitGroup

-	// Start metric store if enabled
-	if memorystore.InternalCCMSFlag {
-		mscfg := ccconf.GetPackageConfig("metric-store")
-		if mscfg == nil {
-			return fmt.Errorf("metric store configuration must be present")
-		}
-		memorystore.Init(mscfg, &wg)
-	}
+	// Initialize metric store if configuration is provided
+	haveMetricstore := false
+	mscfg := ccconf.GetPackageConfig("metric-store")
+	if mscfg != nil {
+		metrics := metricstore.BuildMetricList()
+		metricstore.Init(mscfg, metrics, &wg)
+
+		// Inject repository as NodeProvider to break import cycle
+		ms := metricstore.GetMemoryStore()
+		jobRepo := repository.GetJobRepository()
+		ms.SetNodeProvider(jobRepo)
+		metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
+		haveMetricstore = true
+	} else {
+		metricstore.MetricStoreHandle = nil
+		cclog.Debug("missing internal metricstore configuration")
+	}
+
+	// Initialize external metric stores if configuration is provided
+	mscfg = ccconf.GetPackageConfig("metric-store-external")
+	if mscfg != nil {
+		err := metricdispatch.Init(mscfg)
+
+		if err != nil {
+			cclog.Debugf("initializing metricdispatch: %v", err)
+		} else {
+			haveMetricstore = true
+		}
+	}
+
+	if !haveMetricstore {
+		return fmt.Errorf("missing metricstore configuration")
+	}

 	// Start archiver and task manager
@@ -344,13 +369,11 @@ func runServer(ctx context.Context) error {
 	errChan := make(chan error, 1)

 	// Start HTTP server
-	wg.Add(1)
-	go func() {
-		defer wg.Done()
+	wg.Go(func() {
 		if err := srv.Start(ctx); err != nil {
 			errChan <- err
 		}
-	}()
+	})

 	// Handle shutdown signals
 	wg.Add(1)
@@ -364,7 +387,7 @@ func runServer(ctx context.Context) error {
 	case <-ctx.Done():
 	}

-	runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
+	runtime.SystemdNotify(false, "Shutting down ...")
 	srv.Shutdown(ctx)
 	util.FsWatcherShutdown()
 	taskmanager.Shutdown()
@@ -372,26 +395,41 @@ func runServer(ctx context.Context) error {

 	// Set GC percent if not configured
 	if os.Getenv(envGOGC) == "" {
-		debug.SetGCPercent(25)
+		debug.SetGCPercent(15)
 	}
-	runtimeEnv.SystemdNotifiy(true, "running")
+	runtime.SystemdNotify(true, "running")

 	// Wait for completion or error
 	waitDone := make(chan struct{})
 	go func() {
 		wg.Wait()
 		close(waitDone)
 	}()

+	go func() {
+		<-waitDone
+		close(errChan)
+	}()
+
-	// Check for server startup errors
+	// Wait for either:
+	// 1. An error from server startup
+	// 2. Completion of all goroutines (normal shutdown or crash)
 	select {
 	case err := <-errChan:
+		// errChan will be closed when waitDone is closed, which happens
+		// when all goroutines complete (either from normal shutdown or error)
 		if err != nil {
 			return err
 		}
 	case <-time.After(100 * time.Millisecond):
-		// Server started successfully, wait for completion
-		if err := <-errChan; err != nil {
-			return err
-		}
+		// Give the server 100ms to start and report any immediate startup errors
+		// After that, just wait for normal shutdown completion
+		select {
+		case err := <-errChan:
+			if err != nil {
+				return err
+			}
+		case <-waitDone:
+			// Normal shutdown completed
+		}
 	}

@@ -2,6 +2,7 @@
 // All rights reserved. This file is part of cc-backend.
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

+// Package main provides the entry point for the ClusterCockpit backend server.
+// This file contains HTTP server setup, routing configuration, and
+// authentication middleware integration.
@@ -13,7 +14,6 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"net"
 	"net/http"
 	"os"
@@ -29,13 +29,15 @@ import (
 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/graph"
 	"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
-	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
 	"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
+	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
 	"github.com/ClusterCockpit/cc-backend/web"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/runtimeEnv"
-	"github.com/gorilla/handlers"
-	"github.com/gorilla/mux"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/nats"
+	"github.com/ClusterCockpit/cc-lib/v2/runtime"
+	"github.com/go-chi/chi/v5"
+	"github.com/go-chi/chi/v5/middleware"
+	"github.com/go-chi/cors"
 	httpSwagger "github.com/swaggo/http-swagger"
 )

@@ -48,9 +50,10 @@ const (

 // Server encapsulates the HTTP server state and dependencies
 type Server struct {
-	router    *mux.Router
-	server    *http.Server
-	apiHandle *api.RestAPI
+	router        chi.Router
+	server        *http.Server
+	restAPIHandle *api.RestAPI
+	natsAPIHandle *api.NatsAPI
 }

 func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
@@ -67,7 +70,7 @@ func NewServer(version, commit, buildDate string) (*Server, error) {
 	buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate}

 	s := &Server{
-		router: mux.NewRouter(),
+		router: chi.NewRouter(),
 	}

 	if err := s.init(); err != nil {
@@ -103,7 +106,28 @@ func (s *Server) init() error {

 	authHandle := auth.GetAuthInstance()

-	s.apiHandle = api.New()
+	// Middleware must be defined before routes in chi
+	s.router.Use(func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
+			start := time.Now()
+			ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor)
+			next.ServeHTTP(ww, r)
+			cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
+				r.Method, r.URL.RequestURI(),
+				ww.Status(), float32(ww.BytesWritten())/1024,
+				time.Since(start).Milliseconds())
+		})
+	})
+	s.router.Use(middleware.Compress(5))
+	s.router.Use(middleware.Recoverer)
+	s.router.Use(cors.Handler(cors.Options{
+		AllowCredentials: true,
+		AllowedHeaders:   []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"},
+		AllowedMethods:   []string{"GET", "POST", "HEAD", "OPTIONS"},
+		AllowedOrigins:   []string{"*"},
+	}))
+
+	s.restAPIHandle = api.New()

 	info := map[string]any{}
 	info["hasOpenIDConnect"] = false
@@ -114,11 +138,11 @@ func (s *Server) init() error {
 		info["hasOpenIDConnect"] = true
 	}

-	s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) {
+	s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) {
 		rw.Header().Add("Content-Type", "text/html; charset=utf-8")
 		cclog.Debugf("##%v##", info)
 		web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info})
-	}).Methods(http.MethodGet)
+	})
 	s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) {
 		rw.Header().Add("Content-Type", "text/html; charset=utf-8")
 		web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo})
@@ -128,13 +152,6 @@ func (s *Server) init() error {
 		web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo})
 	})

-	secured := s.router.PathPrefix("/").Subrouter()
-	securedapi := s.router.PathPrefix("/api").Subrouter()
-	userapi := s.router.PathPrefix("/userapi").Subrouter()
-	configapi := s.router.PathPrefix("/config").Subrouter()
-	frontendapi := s.router.PathPrefix("/frontend").Subrouter()
-	metricstoreapi := s.router.PathPrefix("/metricstore").Subrouter()
-
 	if !config.Keys.DisableAuthentication {
 		// Create login failure handler (used by both /login and /jwt-login)
 		loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) {
@@ -149,10 +166,10 @@ func (s *Server) init() error {
 			})
 		}

-		s.router.Handle("/login", authHandle.Login(loginFailureHandler)).Methods(http.MethodPost)
-		s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler))
+		s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP)
+		s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP)

-		s.router.Handle("/logout", authHandle.Logout(
+		s.router.Post("/logout", authHandle.Logout(
 			http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
 				rw.Header().Add("Content-Type", "text/html; charset=utf-8")
 				rw.WriteHeader(http.StatusOK)
@@ -163,111 +180,158 @@ func (s *Server) init() error {
 					Build:   buildInfo,
 					Infos:   info,
 				})
-			}))).Methods(http.MethodPost)
-
-		secured.Use(func(next http.Handler) http.Handler {
-			return authHandle.Auth(
-				// On success;
-				next,
-
-				// On failure:
-				func(rw http.ResponseWriter, r *http.Request, err error) {
-					rw.WriteHeader(http.StatusUnauthorized)
-					web.RenderTemplate(rw, "login.tmpl", &web.Page{
-						Title:    "Authentication failed - ClusterCockpit",
-						MsgType:  "alert-danger",
-						Message:  err.Error(),
-						Build:    buildInfo,
-						Infos:    info,
-						Redirect: r.RequestURI,
-					})
-				})
-		})
-
-		securedapi.Use(func(next http.Handler) http.Handler {
-			return authHandle.AuthAPI(
-				// On success;
-				next,
-				// On failure: JSON Response
-				onFailureResponse)
-		})
-
-		userapi.Use(func(next http.Handler) http.Handler {
-			return authHandle.AuthUserAPI(
-				// On success;
-				next,
-				// On failure: JSON Response
-				onFailureResponse)
-		})
-
-		metricstoreapi.Use(func(next http.Handler) http.Handler {
-			return authHandle.AuthMetricStoreAPI(
-				// On success;
-				next,
-				// On failure: JSON Response
-				onFailureResponse)
-		})
-
-		configapi.Use(func(next http.Handler) http.Handler {
-			return authHandle.AuthConfigAPI(
-				// On success;
-				next,
-				// On failure: JSON Response
-				onFailureResponse)
-		})
-
-		frontendapi.Use(func(next http.Handler) http.Handler {
-			return authHandle.AuthFrontendAPI(
-				// On success;
-				next,
-				// On failure: JSON Response
-				onFailureResponse)
-		})
+			}))).ServeHTTP)
 	}

 	if flagDev {
 		s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query"))
-		s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler(
-			httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet)
+		s.router.Get("/swagger/*", httpSwagger.Handler(
+			httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json")))
 	}
-	secured.Handle("/query", graphQLServer)
-
-	// Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project.
-	secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
-		routerConfig.HandleSearchBar(rw, r, buildInfo)
-	})
-
-	// Mount all /monitoring/... and /api/... routes.
-	routerConfig.SetupRoutes(secured, buildInfo)
-	s.apiHandle.MountAPIRoutes(securedapi)
-	s.apiHandle.MountUserAPIRoutes(userapi)
-	s.apiHandle.MountConfigAPIRoutes(configapi)
-	s.apiHandle.MountFrontendAPIRoutes(frontendapi)
-
-	if memorystore.InternalCCMSFlag {
-		s.apiHandle.MountMetricStoreAPIRoutes(metricstoreapi)
-	}
+	// Secured routes (require authentication)
+	s.router.Group(func(secured chi.Router) {
+		if !config.Keys.DisableAuthentication {
+			secured.Use(func(next http.Handler) http.Handler {
+				return authHandle.Auth(
+					next,
+					func(rw http.ResponseWriter, r *http.Request, err error) {
+						rw.WriteHeader(http.StatusUnauthorized)
+						web.RenderTemplate(rw, "login.tmpl", &web.Page{
+							Title:    "Authentication failed - ClusterCockpit",
+							MsgType:  "alert-danger",
+							Message:  err.Error(),
+							Build:    buildInfo,
+							Infos:    info,
+							Redirect: r.RequestURI,
+						})
+					})
+			})
+		}
+
+		secured.Handle("/query", graphQLServer)
+
+		secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) {
+			routerConfig.HandleSearchBar(rw, r, buildInfo)
+		})
+
+		routerConfig.SetupRoutes(secured, buildInfo)
+	})
+
+	// API routes (JWT token auth)
+	s.router.Route("/api", func(apiRouter chi.Router) {
+		// Main API routes with API auth
+		apiRouter.Group(func(securedapi chi.Router) {
+			if !config.Keys.DisableAuthentication {
+				securedapi.Use(func(next http.Handler) http.Handler {
+					return authHandle.AuthAPI(next, onFailureResponse)
+				})
+			}
+			s.restAPIHandle.MountAPIRoutes(securedapi)
+		})
+
+		// Metric store API routes with separate auth
+		apiRouter.Group(func(metricstoreapi chi.Router) {
+			if !config.Keys.DisableAuthentication {
+				metricstoreapi.Use(func(next http.Handler) http.Handler {
+					return authHandle.AuthMetricStoreAPI(next, onFailureResponse)
+				})
+			}
+			s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi)
+		})
+	})
+
+	// User API routes
+	s.router.Route("/userapi", func(userapi chi.Router) {
+		if !config.Keys.DisableAuthentication {
+			userapi.Use(func(next http.Handler) http.Handler {
+				return authHandle.AuthUserAPI(next, onFailureResponse)
+			})
+		}
+		s.restAPIHandle.MountUserAPIRoutes(userapi)
+	})
+
+	// Config API routes (uses Group with full paths to avoid shadowing
+	// the /config page route that is registered in the secured group)
+	s.router.Group(func(configapi chi.Router) {
+		if !config.Keys.DisableAuthentication {
+			configapi.Use(func(next http.Handler) http.Handler {
+				return authHandle.AuthConfigAPI(next, onFailureResponse)
+			})
+		}
+		s.restAPIHandle.MountConfigAPIRoutes(configapi)
+	})
+
+	// Frontend API routes
+	s.router.Route("/frontend", func(frontendapi chi.Router) {
+		if !config.Keys.DisableAuthentication {
+			frontendapi.Use(func(next http.Handler) http.Handler {
+				return authHandle.AuthFrontendAPI(next, onFailureResponse)
+			})
+		}
+		s.restAPIHandle.MountFrontendAPIRoutes(frontendapi)
+	})
+
+	if config.Keys.APISubjects != nil {
+		s.natsAPIHandle = api.NewNatsAPI()
+		if err := s.natsAPIHandle.StartSubscriptions(); err != nil {
+			return fmt.Errorf("starting NATS subscriptions: %w", err)
+		}
+	}
+
+	// 404 handler for pages and API routes
+	notFoundHandler := func(rw http.ResponseWriter, r *http.Request) {
+		if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") ||
+			strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") {
+			rw.Header().Set("Content-Type", "application/json")
+			rw.WriteHeader(http.StatusNotFound)
+			json.NewEncoder(rw).Encode(map[string]string{
+				"status": "Resource not found",
+				"error":  "the requested endpoint does not exist",
+			})
+			return
+		}
+		rw.Header().Set("Content-Type", "text/html; charset=utf-8")
+		rw.WriteHeader(http.StatusNotFound)
+		web.RenderTemplate(rw, "404.tmpl", &web.Page{
+			Title: "Page Not Found",
+			Build: buildInfo,
+		})
+	}
+
+	// Set NotFound on the router so chi uses it for all unmatched routes,
+	// including those under subrouters like /api, /userapi, /frontend, etc.
+	s.router.NotFound(notFoundHandler)

 	if config.Keys.EmbedStaticFiles {
 		if i, err := os.Stat("./var/img"); err == nil {
 			if i.IsDir() {
 				cclog.Info("Use local directory for static images")
-				s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
+				s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img"))))
 			}
 		}
-		s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles()))
+		fileServer := http.StripPrefix("/", web.ServeFiles())
+		s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
+			if web.StaticFileExists(r.URL.Path) {
+				fileServer.ServeHTTP(rw, r)
+				return
+			}
+			notFoundHandler(rw, r)
+		}))
 	} else {
-		s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles)))
+		staticDir := http.Dir(config.Keys.StaticFiles)
+		fileServer := http.FileServer(staticDir)
+		s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
+			f, err := staticDir.Open(r.URL.Path)
+			if err == nil {
+				f.Close()
+				fileServer.ServeHTTP(rw, r)
+				return
+			}
+			notFoundHandler(rw, r)
+		}))
 	}

-	s.router.Use(handlers.CompressHandler)
-	s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true)))
-	s.router.Use(handlers.CORS(
-		handlers.AllowCredentials(),
-		handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}),
-		handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
-		handlers.AllowedOrigins([]string{"*"})))
-
 	return nil
 }

@@ -278,20 +342,6 @@ const (
 )

 func (s *Server) Start(ctx context.Context) error {
-	handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) {
-		if strings.HasPrefix(params.Request.RequestURI, "/api/") {
-			cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
-				params.Request.Method, params.URL.RequestURI(),
-				params.StatusCode, float32(params.Size)/1024,
-				time.Since(params.TimeStamp).Milliseconds())
-		} else {
-			cclog.Debugf("%s %s (%d, %.02fkb, %dms)",
-				params.Request.Method, params.URL.RequestURI(),
-				params.StatusCode, float32(params.Size)/1024,
-				time.Since(params.TimeStamp).Milliseconds())
-		}
-	})
-
 	// Use configurable timeouts with defaults
 	readTimeout := time.Duration(defaultReadTimeout) * time.Second
 	writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
@@ -299,7 +349,7 @@ func (s *Server) Start(ctx context.Context) error {
 	s.server = &http.Server{
 		ReadTimeout:  readTimeout,
 		WriteTimeout: writeTimeout,
-		Handler:      handler,
+		Handler:      s.router,
 		Addr:         config.Keys.Addr,
 	}

@@ -338,7 +388,7 @@ func (s *Server) Start(ctx context.Context) error {
 	// Because this program will want to bind to a privileged port (like 80), the listener must
 	// be established first, then the user can be changed, and after that,
 	// the actual http server can be started.
-	if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
+	if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil {
 		return fmt.Errorf("dropping privileges: %w", err)
 	}

@@ -363,15 +413,18 @@ func (s *Server) Shutdown(ctx context.Context) {
 	shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
 	defer cancel()

+	nc := nats.GetClient()
+	if nc != nil {
+		nc.Close()
+	}
+
 	// First shut down the server gracefully (waiting for all ongoing requests)
 	if err := s.server.Shutdown(shutdownCtx); err != nil {
 		cclog.Errorf("Server shutdown error: %v", err)
 	}

 	// Archive all the metric store data
-	if memorystore.InternalCCMSFlag {
-		memorystore.Shutdown()
-	}
+	metricstore.Shutdown()

 	// Shutdown archiver with 10 second timeout for fast shutdown
 	if err := archiver.Shutdown(10 * time.Second); err != nil {
@@ -1,91 +1,29 @@
 {
   "main": {
-    "addr": "127.0.0.1:8080",
-    "short-running-jobs-duration": 300,
-    "resampling": {
-      "minimumPoints": 600,
-      "trigger": 180,
-      "resolutions": [
-        240,
-        60
-      ]
-    },
-    "apiAllowedIPs": [
-      "*"
-    ],
-    "emission-constant": 317
+    "addr": "127.0.0.1:8080"
   },
   "cron": {
-    "commit-job-worker": "2m",
-    "duration-worker": "5m",
-    "footprint-worker": "10m"
-  },
-  "archive": {
-    "kind": "file",
-    "path": "./var/job-archive"
+    "commit-job-worker": "1m",
+    "duration-worker": "3m",
+    "footprint-worker": "5m"
   },
   "auth": {
     "jwts": {
       "max-age": "2000h"
     }
   },
-  "clusters": [
+  "metric-store-external": [
     {
-      "name": "fritz",
-      "metricDataRepository": {
-        "kind": "cc-metric-store-internal",
-        "url": "http://localhost:8082",
-        "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
-      },
-      "filterRanges": {
-        "numNodes": {
-          "from": 1,
-          "to": 64
-        },
-        "duration": {
-          "from": 0,
-          "to": 86400
-        },
-        "startTime": {
-          "from": "2022-01-01T00:00:00Z",
-          "to": null
-        }
-      }
-    },
-    {
-      "name": "alex",
-      "metricDataRepository": {
-        "kind": "cc-metric-store-internal",
-        "url": "http://localhost:8082",
-        "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
-      },
-      "filterRanges": {
-        "numNodes": {
-          "from": 1,
-          "to": 64
-        },
-        "duration": {
-          "from": 0,
-          "to": 86400
-        },
-        "startTime": {
-          "from": "2022-01-01T00:00:00Z",
-          "to": null
-        }
-      }
+      "scope": "fritz",
+      "url": "http://0.0.0.0:8082",
+      "token": "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI6MTc2ODU3ODg0NCwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9._SDEW9WaUVXSBFmWqGhyIZXLoqoDU8F1hkfh4cXKIqF4yw7w50IUpfUBtwUFUOnoviFKoi563f6RAMC7XxeLDA"
     }
   ],
   "metric-store": {
     "checkpoints": {
-      "file-format": "avro",
-      "interval": "1h",
-      "directory": "./var/checkpoints",
-      "restore": "48h"
+      "interval": "12h"
     },
-    "archive": {
-      "interval": "1h",
-      "directory": "./var/archive"
-    },
-    "retention-in-memory": "48h"
+    "retention-in-memory": "48h",
+    "memory-cap": 100
   }
 }

@@ -1,64 +0,0 @@
-{
-  "addr": "127.0.0.1:8080",
-  "short-running-jobs-duration": 300,
-  "archive": {
-    "kind": "file",
-    "path": "./var/job-archive"
-  },
-  "jwts": {
-    "max-age": "2000h"
-  },
-  "db-driver": "mysql",
-  "db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit",
-  "enable-resampling": {
-    "trigger": 30,
-    "resolutions": [600, 300, 120, 60]
-  },
-  "emission-constant": 317,
-  "clusters": [
-    {
-      "name": "fritz",
-      "metricDataRepository": {
-        "kind": "cc-metric-store",
-        "url": "http://localhost:8082",
-        "token": ""
-      },
-      "filterRanges": {
-        "numNodes": {
-          "from": 1,
-          "to": 64
-        },
-        "duration": {
-          "from": 0,
-          "to": 86400
-        },
-        "startTime": {
-          "from": "2022-01-01T00:00:00Z",
-          "to": null
-        }
-      }
-    },
-    {
-      "name": "alex",
-      "metricDataRepository": {
-        "kind": "cc-metric-store",
-        "url": "http://localhost:8082",
-        "token": ""
-      },
-      "filterRanges": {
-        "numNodes": {
-          "from": 1,
-          "to": 64
-        },
-        "duration": {
-          "from": 0,
-          "to": 86400
-        },
-        "startTime": {
-          "from": "2022-01-01T00:00:00Z",
-          "to": null
-        }
-      }
-    }
-  ]
-}
@@ -5,46 +5,95 @@
     "https-key-file": "/etc/letsencrypt/live/url/privkey.pem",
     "user": "clustercockpit",
     "group": "clustercockpit",
     "validate": false,
-    "apiAllowedIPs": ["*"],
+    "api-allowed-ips": ["*"],
     "short-running-jobs-duration": 300,
+    "enable-job-taggers": true,
+    "nodestate-retention": {
+      "policy": "move",
+      "target-kind": "file",
+      "target-path": "./var/nodestate-archive"
+    },
     "resampling": {
-      "trigger": 30,
-      "resolutions": [600, 300, 120, 60]
+      "minimum-points": 600,
+      "trigger": 180,
+      "resolutions": [240, 60]
     },
+    "api-subjects": {
+      "subject-job-event": "cc.job.event",
+      "subject-node-state": "cc.node.state"
+    }
   },
+  "nats": {
+    "address": "nats://0.0.0.0:4222",
+    "username": "root",
+    "password": "root"
+  },
   "auth": {
     "jwts": {
       "max-age": "2000h"
     }
   },
   "cron": {
-    "commit-job-worker": "2m",
+    "commit-job-worker": "1m",
     "duration-worker": "5m",
     "footprint-worker": "10m"
   },
   "archive": {
-    "kind": "file",
-    "path": "./var/job-archive"
-  },
-  "clusters": [
-    {
-      "name": "test",
-      "metricDataRepository": {
-        "kind": "cc-metric-store",
-        "url": "http://localhost:8082",
-        "token": "eyJhbGciOiJF-E-pQBQ"
-      },
-      "filterRanges": {
-        "numNodes": {
-          "from": 1,
-          "to": 64
-        },
-        "duration": {
-          "from": 0,
-          "to": 86400
-        },
-        "startTime": {
-          "from": "2022-01-01T00:00:00Z",
-          "to": null
-        }
-      }
-    }
-  ]
+    "kind": "s3",
+    "endpoint": "http://x.x.x.x",
+    "bucket": "jobarchive",
+    "access-key": "xx",
+    "secret-key": "xx",
+    "retention": {
+      "policy": "move",
+      "age": 365,
+      "location": "./var/archive"
+    }
+  },
+  "metric-store-external": [
+    {
+      "scope": "*",
+      "url": "http://x.x.x.x:8082",
+      "token": "MySecret"
+    },
+    {
+      "scope": "fritz",
+      "url": "http://x.x.x.x:8084",
+      "token": "MySecret"
+    },
+    {
+      "scope": "fritz-spr1tb",
+      "url": "http://x.x.x.x:8083",
+      "token": "MySecret"
+    },
+    {
+      "scope": "alex",
+      "url": "http://x.x.x.x:8084",
+      "token": "MySecret"
+    }
+  ],
+  "metric-store": {
+    "checkpoints": {
+      "interval": "12h",
+      "directory": "./var/checkpoints"
+    },
+    "memory-cap": 100,
+    "retention-in-memory": "48h",
+    "cleanup": {
+      "mode": "archive",
+      "interval": "48h",
+      "directory": "./var/archive"
+    },
+    "nats-subscriptions": [
+      {
+        "subscribe-to": "hpc-nats",
+        "cluster-tag": "fritz"
+      },
+      {
+        "subscribe-to": "hpc-nats",
+        "cluster-tag": "alex"
+      }
+    ]
+  },
+  "ui-file": "ui-config.json"
 }

configs/startJobPayload.json (new file, 22 lines)
@@ -0,0 +1,22 @@
{
  "cluster": "fritz",
  "jobId": 123000,
  "jobState": "running",
  "numAcc": 0,
  "numHwthreads": 72,
  "numNodes": 1,
  "partition": "main",
  "requestedMemory": 128000,
  "resources": [{ "hostname": "f0726" }],
  "startTime": 1649723812,
  "subCluster": "main",
  "submitTime": 1649723812,
  "user": "k106eb10",
  "project": "k106eb",
  "walltime": 86400,
  "metaData": {
    "slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n",
    "jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n",
    "jobName": "ams_pipeline"
  }
}
configs/stopJobPayload.json (new file, 7 lines)
@@ -0,0 +1,7 @@
{
  "cluster": "fritz",
  "jobId": 123000,
  "jobState": "completed",
  "startTime": 1649723812,
  "stopTime": 1649763839
}
configs/tagger/README.md (new file, 419 lines)
@@ -0,0 +1,419 @@
# Job Tagging Configuration

ClusterCockpit provides automatic job tagging functionality to classify and
categorize jobs based on configurable rules. The tagging system consists of two
main components:

1. **Application Detection** - Identifies which application a job is running
2. **Job Classification** - Analyzes job performance characteristics and applies classification tags

## Directory Structure

```
configs/tagger/
├── apps/          # Application detection patterns
│   ├── vasp.txt
│   ├── gromacs.txt
│   └── ...
└── jobclasses/    # Job classification rules
    ├── parameters.json
    ├── lowUtilization.json
    ├── highload.json
    └── ...
```

## Activating Tagger Rules

### Step 1: Copy Configuration Files

To activate tagging, review, adapt, and copy the configuration files from
`configs/tagger/` to `var/tagger/`:

```bash
# From the cc-backend root directory
mkdir -p var/tagger
cp -r configs/tagger/apps var/tagger/
cp -r configs/tagger/jobclasses var/tagger/
```

### Step 2: Enable Tagging in Configuration

Add or set the following configuration key in the `main` section of your `config.json`:

```json
{
  "enable-job-taggers": true
}
```

**Important**: Automatic tagging is disabled by default. You must explicitly
enable it by setting `enable-job-taggers: true` in the main configuration file.

### Step 3: Restart cc-backend

The tagger system automatically loads configuration from `./var/tagger/` at
startup. After copying the files and enabling the feature, restart cc-backend:

```bash
./cc-backend -server
```

### Step 4: Verify Configuration Loaded

Check the logs for messages indicating successful configuration loading:

```
[INFO] Setup file watch for ./var/tagger/apps
[INFO] Setup file watch for ./var/tagger/jobclasses
```

## How Tagging Works

### Automatic Tagging

When `enable-job-taggers` is set to `true` in the configuration, tags are
automatically applied when:

- **Job Start**: Application detection runs immediately when a job starts
- **Job Stop**: Job classification runs when a job completes

The system analyzes job metadata and metrics to determine appropriate tags.

**Note**: Automatic tagging only works for jobs that start or stop after the
feature is enabled. Existing jobs are not automatically retagged.

### Manual Tagging (Retroactive)

To apply tags to existing jobs in the database, use the `-apply-tags` command
line option:

```bash
./cc-backend -apply-tags
```

This processes all jobs in the database and applies current tagging rules. This
is useful when:

- You have existing jobs that were created before tagging was enabled
- You've added new tagging rules and want to apply them to historical data
- You've modified existing rules and want to re-evaluate all jobs

### Hot Reload

The tagger system watches the configuration directories for changes. You can
modify or add rules without restarting `cc-backend`:

- Changes to `var/tagger/apps/*` are detected automatically
- Changes to `var/tagger/jobclasses/*` are detected automatically
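
The watch mechanism itself is not shown in this README. As a rough sketch of
how such a directory watch can be implemented, assuming the `fsnotify` package
(an indirect dependency of cc-backend) and a hypothetical `reloadRules`
callback standing in for the actual rule-loading logic:

```go
package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

// watchDir invokes reloadRules whenever a file in dir is created or
// modified. reloadRules is a hypothetical placeholder, not the actual
// cc-backend implementation.
func watchDir(dir string, reloadRules func(path string)) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	if err := watcher.Add(dir); err != nil {
		return err
	}
	go func() {
		for event := range watcher.Events {
			if event.Has(fsnotify.Write) || event.Has(fsnotify.Create) {
				log.Printf("reloading tagger rules: %s", event.Name)
				reloadRules(event.Name)
			}
		}
	}()
	return nil
}
```
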
## Application Detection

Application detection identifies which software a job is running by matching
patterns in the job script.

### Configuration Format

Application patterns are stored in text files under `var/tagger/apps/`. Each
file contains one or more regular expression patterns (one per line) that match
against the job script.

**Example: `apps/vasp.txt`**

```
vasp
VASP
```

### How It Works

1. When a job starts, the system retrieves the job script from metadata
2. Each line in the app files is treated as a regex pattern
3. Patterns are matched case-insensitively against the lowercased job script
4. If a match is found, a tag of type `app` with the filename (without extension) is applied
5. Only the first matching application is tagged
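
The matching loop itself is straightforward. Here is a minimal self-contained
sketch of steps 2-5; the names `appRule`, `appRules`, and `detectApp` are
illustrative stand-ins, not the actual cc-backend implementation:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// appRule pairs a tag name (the pattern file name without .txt) with the
// compiled patterns read from that file.
type appRule struct {
	tag      string
	patterns []*regexp.Regexp
}

// Stand-in for the rules loaded from var/tagger/apps/.
var appRules = []appRule{
	{"vasp", []*regexp.Regexp{regexp.MustCompile(`vasp`)}},
	{"python", []*regexp.Regexp{regexp.MustCompile(`python3?`), regexp.MustCompile(`\.py`)}},
}

// detectApp returns the tag of the first rule whose pattern matches the
// lowercased job script (steps 3-5 above).
func detectApp(jobScript string) (string, bool) {
	script := strings.ToLower(jobScript)
	for _, rule := range appRules {
		for _, re := range rule.patterns {
			if re.MatchString(script) {
				return rule.tag, true // only the first match is tagged
			}
		}
	}
	return "", false
}

func main() {
	tag, ok := detectApp("#!/bin/bash\nmpirun vasp_std")
	fmt.Println(tag, ok) // vasp true
}
```
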

### Adding New Applications

1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`)
2. Add regex patterns, one per line:

```
tensorflow
tf\.keras
import tensorflow
```

3. The file is automatically detected and loaded

**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`).

## Job Classification

Job classification analyzes completed jobs based on their metrics and properties
to identify performance issues or characteristics.

### Configuration Format

Job classification rules are defined in JSON files under
`var/tagger/jobclasses/`. Each rule file defines:

- **Metrics required**: Which job metrics to analyze
- **Requirements**: Pre-conditions that must be met
- **Variables**: Computed values used in the rule
- **Rule expression**: Boolean expression that determines if the rule matches
- **Hint template**: Message displayed when the rule matches

### Parameters File

`jobclasses/parameters.json` defines shared threshold values used across multiple rules:

```json
{
  "lowcpuload_threshold_factor": 0.9,
  "highmemoryusage_threshold_factor": 0.9,
  "job_min_duration_seconds": 600.0,
  "sampling_interval_seconds": 30.0
}
```

### Rule File Structure

**Example: `jobclasses/lowUtilization.json`**

```json
{
  "name": "Low resource utilization",
  "tag": "lowutilization",
  "parameters": ["job_min_duration_seconds"],
  "metrics": ["flops_any", "mem_bw"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "mem_bw_perc",
      "expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
    }
  ],
  "rule": "flops_any.avg < flops_any.limits.alert",
  "hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
}
```

#### Field Descriptions

| Field          | Description                                                                    |
| -------------- | ------------------------------------------------------------------------------ |
| `name`         | Human-readable description of the rule                                        |
| `tag`          | Tag identifier applied when the rule matches                                  |
| `parameters`   | List of parameter names from `parameters.json` to include in rule environment |
| `metrics`      | List of metrics required for evaluation (must be present in job data)         |
| `requirements` | Boolean expressions that must all be true for the rule to be evaluated        |
| `variables`    | Named expressions computed before evaluating the main rule                    |
| `rule`         | Boolean expression that determines if the job matches this classification     |
| `hint`         | Go template string for generating a user-visible message                      |

### Expression Environment

Expressions in `requirements`, `variables`, and `rule` have access to:

**Job Properties:**

- `job.shared` - Shared node allocation type
- `job.duration` - Job runtime in seconds
- `job.numCores` - Number of CPU cores
- `job.numNodes` - Number of nodes
- `job.jobState` - Job completion state
- `job.numAcc` - Number of accelerators
- `job.smt` - SMT setting

**Metric Statistics (for each metric in `metrics`):**

- `<metric>.min` - Minimum value
- `<metric>.max` - Maximum value
- `<metric>.avg` - Average value
- `<metric>.limits.peak` - Peak limit from cluster config
- `<metric>.limits.normal` - Normal threshold
- `<metric>.limits.caution` - Caution threshold
- `<metric>.limits.alert` - Alert threshold

**Parameters:**

- All parameters listed in the `parameters` field

**Variables:**

- All variables defined in the `variables` array

### Expression Language

Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations:

- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^`
- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Logical**: `&&`, `||`, `!`
- **Functions**: Standard math functions (see expr documentation)
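
To make the evaluation order concrete, here is a small self-contained sketch
that evaluates a rule with expr in the order the fields above suggest
(requirements first, then variables, then the rule). The job properties, metric
statistics, and parameter values are hard-coded stand-ins, not real job data:

```go
package main

import (
	"fmt"

	"github.com/expr-lang/expr"
)

// evalBool compiles and runs a boolean expression against env.
func evalBool(code string, env map[string]any) (bool, error) {
	program, err := expr.Compile(code, expr.Env(env), expr.AsBool())
	if err != nil {
		return false, err
	}
	out, err := expr.Run(program, env)
	if err != nil {
		return false, err
	}
	return out.(bool), nil
}

func main() {
	// Hard-coded stand-ins for job properties, metric statistics and parameters.
	env := map[string]any{
		"job": map[string]any{"shared": "none", "duration": 7200},
		"cpu_load": map[string]any{
			"avg":    12.5,
			"limits": map[string]any{"peak": 72.0},
		},
		"lowcpuload_threshold_factor": 0.85,
		"job_min_duration_seconds":    600.0,
	}

	// 1. Requirements: all must hold, otherwise the rule is skipped.
	for _, req := range []string{
		`job.shared == "none"`,
		`job.duration > job_min_duration_seconds`,
	} {
		ok, err := evalBool(req, env)
		if err != nil || !ok {
			fmt.Println("requirement not met, skipping rule")
			return
		}
	}

	// 2. Variables: computed and added to the environment.
	prog, err := expr.Compile(`cpu_load.limits.peak * lowcpuload_threshold_factor`, expr.Env(env))
	if err != nil {
		panic(err)
	}
	threshold, err := expr.Run(prog, env)
	if err != nil {
		panic(err)
	}
	env["load_threshold"] = threshold

	// 3. Rule: decides whether the tag is applied.
	matched, err := evalBool(`cpu_load.avg < load_threshold`, env)
	if err != nil {
		panic(err)
	}
	fmt.Println("apply tag 'lowload':", matched) // true for these numbers
}
```
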
### Hint Templates

Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible:

```
{{.flops_any.avg}}   # Access metric average
{{.job.duration}}    # Access job property
{{.my_variable}}     # Access computed variable
```
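
Hints are plain Go templates rendered against that same environment. A minimal
sketch with the standard library only; the nested-map environment here is a
stand-in for the real rule environment:

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// Stand-in for the rule evaluation environment.
	env := map[string]any{
		"flops_any": map[string]any{
			"avg":    42.0,
			"limits": map[string]any{"alert": 100.0},
		},
	}

	hint := "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}"
	tmpl := template.Must(template.New("hint").Parse(hint))

	// Prints: Average flop rate 42 falls below threshold 100
	if err := tmpl.Execute(os.Stdout, env); err != nil {
		panic(err)
	}
}
```
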
### Adding New Classification Rules

1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`)
2. Define the rule structure:

```json
{
  "name": "Memory Leak Detection",
  "tag": "memory_leak",
  "parameters": ["memory_leak_slope_threshold"],
  "metrics": ["mem_used"],
  "requirements": ["job.duration > 3600"],
  "variables": [
    {
      "name": "mem_growth",
      "expr": "(mem_used.max - mem_used.min) / job.duration"
    }
  ],
  "rule": "mem_growth > memory_leak_slope_threshold",
  "hint": "Memory usage grew by {{.mem_growth}} per second"
}
```

3. Add any new parameters to `parameters.json`
4. The file is automatically detected and loaded

## Configuration Paths

The tagger system reads from these paths (relative to cc-backend working directory):

- **Application patterns**: `./var/tagger/apps/`
- **Job classification rules**: `./var/tagger/jobclasses/`

These paths are defined as constants in the source code and cannot be changed without recompiling.

## Troubleshooting

### Tags Not Applied

1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json`

2. **Check configuration exists**:

```bash
ls -la var/tagger/apps
ls -la var/tagger/jobclasses
```

3. **Check logs for errors**:

```bash
./cc-backend -server -loglevel debug
```

4. **Verify file permissions**: Ensure cc-backend can read the configuration files

5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs

### Rules Not Matching

1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation
2. **Check requirements**: Ensure all requirements in the rule are satisfied
3. **Verify metrics exist**: Classification rules require job metrics to be available
4. **Check metric names**: Ensure metric names match those in your cluster configuration

### File Watch Not Working

If changes to configuration files aren't detected:

1. Restart cc-backend to reload all configuration
2. Check filesystem supports file watching (network filesystems may not)
3. Check logs for file watch setup messages

## Best Practices

1. **Start Simple**: Begin with basic rules and refine based on results
2. **Use Requirements**: Filter out irrelevant jobs early with requirements
3. **Test Incrementally**: Add one rule at a time and verify behavior
4. **Document Rules**: Use descriptive names and clear hint messages
5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency
6. **Version Control**: Keep your `var/tagger/` configuration in version control
7. **Backup Before Changes**: Test new rules on a copy before deploying to production

## Examples

### Simple Application Detection

**File: `var/tagger/apps/python.txt`**

```
python
python3
\.py
```

This detects jobs running Python scripts.

### Complex Classification Rule

**File: `var/tagger/jobclasses/cpuImbalance.json`**

```json
{
  "name": "CPU Load Imbalance",
  "tag": "cpu_imbalance",
  "parameters": ["core_load_imbalance_threshold_factor"],
  "metrics": ["cpu_load"],
  "requirements": ["job.numCores > 1", "job.duration > 600"],
  "variables": [
    {
      "name": "load_variance",
      "expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg"
    }
  ],
  "rule": "load_variance > core_load_imbalance_threshold_factor",
  "hint": "CPU load varies by {{printf \"%.2f\" .load_variance}} (relative to the average) across cores"
}
```

This detects jobs where CPU load is unevenly distributed across cores.

## Reference

### Configuration Options

**Main Configuration (`config.json`)**:

- `enable-job-taggers` (boolean, default: `false`) - Enables automatic job tagging system
  - Must be set to `true` to activate automatic tagging on job start/stop events
  - Does not affect the `-apply-tags` command line option

**Command Line Options**:

- `-apply-tags` - Apply all tagging rules to existing jobs in the database
  - Works independently of `enable-job-taggers` configuration
  - Useful for retroactively tagging jobs or re-evaluating with updated rules

### Default Configuration Location

The example configurations are provided in:

- `configs/tagger/apps/` - Example application patterns (16 applications)
- `configs/tagger/jobclasses/` - Example classification rules (3 rules)

Copy these to `var/tagger/` and customize for your environment.

### Tag Types

- `app` - Application tags (e.g., "vasp", "gromacs")
- `jobClass` - Classification tags (e.g., "lowutilization", "highload")

Tags can be queried and filtered in the ClusterCockpit UI and API.
configs/tagger/apps/vasp.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
vasp_gam
vasp_ncl
vasp_std
configs/tagger/jobclasses/highMemoryUsage.json (new file, 21 lines)
@@ -0,0 +1,21 @@
{
  "name": "High memory usage",
  "tag": "highmemory",
  "parameters": [
    "highmemoryusage_threshold_factor",
    "job_min_duration_seconds"
  ],
  "metrics": ["mem_used"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "memory_usage_pct",
      "expr": "mem_used.max / mem_used.limits.peak * 100.0"
    }
  ],
  "rule": "mem_used.max > mem_used.limits.peak * highmemoryusage_threshold_factor",
  "hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
}
@@ -3,8 +3,7 @@
   "tag": "excessiveload",
   "parameters": [
     "excessivecpuload_threshold_factor",
-    "job_min_duration_seconds",
-    "sampling_interval_seconds"
+    "job_min_duration_seconds"
   ],
   "metrics": ["cpu_load"],
   "requirements": [
@@ -15,12 +14,8 @@
     {
       "name": "load_threshold",
       "expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
-    },
-    {
-      "name": "load_perc",
-      "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
     }
   ],
   "rule": "cpu_load.avg > load_threshold",
-  "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}."
+  "hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention."
 }
configs/tagger/jobclasses/lowUtilization.json (new file, 22 lines)
@@ -0,0 +1,22 @@
{
  "name": "Low resource utilization",
  "tag": "lowutilization",
  "parameters": ["job_min_duration_seconds"],
  "metrics": ["flops_any", "mem_bw"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "mem_bw_pct",
      "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
    },
    {
      "name": "flops_any_pct",
      "expr": "flops_any.avg / flops_any.limits.peak * 100.0"
    }
  ],
  "rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
  "hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds."
}
configs/tagger/jobclasses/lowload.json (new file, 18 lines)
@@ -0,0 +1,18 @@
{
  "name": "Low CPU load",
  "tag": "lowload",
  "parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"],
  "metrics": ["cpu_load"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "load_threshold",
      "expr": "cpu_load.limits.peak * lowcpuload_threshold_factor"
    }
  ],
  "rule": "cpu_load.avg < load_threshold",
  "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})."
}
configs/tagger/jobclasses/memoryBound.json (new file, 22 lines)
@@ -0,0 +1,22 @@
{
  "name": "Memory bandwidth bound",
  "tag": "memorybound",
  "parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"],
  "metrics": ["mem_bw"],
  "requirements": [
    "job.shared == \"none\"",
    "job.duration > job_min_duration_seconds"
  ],
  "variables": [
    {
      "name": "mem_bw_threshold",
      "expr": "mem_bw.limits.peak * membound_bw_threshold_factor"
    },
    {
      "name": "mem_bw_pct",
      "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
    }
  ],
  "rule": "mem_bw.avg > mem_bw_threshold",
  "hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) is within {{.membound_bw_threshold_factor}} of peak bandwidth. Consider improving data reuse or compute intensity."
}
@@ -1,11 +1,12 @@
 {
-  "lowcpuload_threshold_factor": 0.9,
-  "excessivecpuload_threshold_factor": 1.1,
+  "lowcpuload_threshold_factor": 0.85,
+  "excessivecpuload_threshold_factor": 1.2,
   "highmemoryusage_threshold_factor": 0.9,
   "node_load_imbalance_threshold_factor": 0.1,
   "core_load_imbalance_threshold_factor": 0.1,
   "high_memory_load_threshold_factor": 0.9,
   "lowgpuload_threshold_factor": 0.7,
+  "membound_bw_threshold_factor": 0.8,
   "memory_leak_slope_threshold": 0.1,
   "job_min_duration_seconds": 600.0,
   "sampling_interval_seconds": 30.0,
@@ -1,38 +1,38 @@
 {
-  "jobList": {
-    "usePaging": false,
-    "showFootprint": false
+  "job-list": {
+    "use-paging": false,
+    "show-footprint": false
   },
-  "jobView": {
-    "showPolarPlot": true,
-    "showFootprint": true,
-    "showRoofline": true,
-    "showStatTable": true
+  "job-view": {
+    "show-polar-plot": true,
+    "show-footprint": true,
+    "show-roofline": true,
+    "show-stat-table": true
   },
-  "metricConfig": {
-    "jobListMetrics": ["mem_bw", "flops_dp"],
-    "jobViewPlotMetrics": ["mem_bw", "flops_dp"],
-    "jobViewTableMetrics": ["mem_bw", "flops_dp"],
+  "metric-config": {
+    "job-list-metrics": ["mem_bw", "flops_dp"],
+    "job-view-plot-metrics": ["mem_bw", "flops_dp"],
+    "job-view-table-metrics": ["mem_bw", "flops_dp"],
     "clusters": [
       {
         "name": "test",
-        "subClusters": [
+        "sub-clusters": [
           {
             "name": "one",
-            "jobListMetrics": ["mem_used", "flops_sp"]
+            "job-list-metrics": ["mem_used", "flops_sp"]
           }
         ]
       }
     ]
   },
-  "nodeList": {
-    "usePaging": true
+  "node-list": {
+    "use-paging": true
   },
-  "plotConfiguration": {
-    "plotsPerRow": 3,
-    "colorBackground": true,
-    "lineWidth": 3,
-    "colorScheme": [
+  "plot-configuration": {
+    "plots-per-row": 3,
+    "color-background": true,
+    "line-width": 3,
+    "color-scheme": [
       "#00bfff",
       "#0000ff",
       "#ff00ff",

go.mod (129 lines changed)
@@ -1,77 +1,75 @@
 module github.com/ClusterCockpit/cc-backend

-go 1.24.0
+go 1.25.0

-toolchain go1.24.1
+tool (
+	github.com/99designs/gqlgen
+	github.com/swaggo/swag/cmd/swag
+)

 require (
-	github.com/99designs/gqlgen v0.17.84
-	github.com/ClusterCockpit/cc-lib v1.0.0
+	github.com/99designs/gqlgen v0.17.86
+	github.com/ClusterCockpit/cc-lib/v2 v2.7.0
+	github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0
 	github.com/Masterminds/squirrel v1.5.4
-	github.com/aws/aws-sdk-go-v2 v1.41.0
-	github.com/aws/aws-sdk-go-v2/config v1.31.20
-	github.com/aws/aws-sdk-go-v2/credentials v1.18.24
-	github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2
-	github.com/coreos/go-oidc/v3 v3.16.0
-	github.com/expr-lang/expr v1.17.6
-	github.com/go-co-op/gocron/v2 v2.18.2
+	github.com/aws/aws-sdk-go-v2 v1.41.1
+	github.com/aws/aws-sdk-go-v2/config v1.32.8
+	github.com/aws/aws-sdk-go-v2/credentials v1.19.8
+	github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0
+	github.com/coreos/go-oidc/v3 v3.17.0
+	github.com/expr-lang/expr v1.17.8
+	github.com/go-chi/chi/v5 v5.2.5
+	github.com/go-chi/cors v1.2.2
+	github.com/go-co-op/gocron/v2 v2.19.1
 	github.com/go-ldap/ldap/v3 v3.4.12
 	github.com/go-sql-driver/mysql v1.9.3
-	github.com/golang-jwt/jwt/v5 v5.3.0
+	github.com/golang-jwt/jwt/v5 v5.3.1
 	github.com/golang-migrate/migrate/v4 v4.19.1
-	github.com/google/gops v0.3.28
-	github.com/gorilla/handlers v1.5.2
-	github.com/gorilla/mux v1.8.1
+	github.com/google/gops v0.3.29
 	github.com/gorilla/sessions v1.4.0
 	github.com/influxdata/line-protocol/v2 v2.2.1
 	github.com/jmoiron/sqlx v1.4.0
 	github.com/joho/godotenv v1.5.1
 	github.com/linkedin/goavro/v2 v2.14.1
-	github.com/mattn/go-sqlite3 v1.14.32
-	github.com/nats-io/nats.go v1.47.0
-	github.com/prometheus/client_golang v1.23.2
-	github.com/prometheus/common v0.67.4
+	github.com/mattn/go-sqlite3 v1.14.34
+	github.com/parquet-go/parquet-go v0.27.0
 	github.com/qustavo/sqlhooks/v2 v2.1.0
 	github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
 	github.com/stretchr/testify v1.11.1
 	github.com/swaggo/http-swagger v1.3.4
 	github.com/swaggo/swag v1.16.6
 	github.com/vektah/gqlparser/v2 v2.5.31
-	golang.org/x/crypto v0.45.0
-	golang.org/x/oauth2 v0.32.0
+	golang.org/x/crypto v0.48.0
+	golang.org/x/oauth2 v0.35.0
 	golang.org/x/time v0.14.0
 )

 require (
 	filippo.io/edwards25519 v1.1.0 // indirect
-	github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
+	github.com/Azure/go-ntlmssp v0.1.0 // indirect
 	github.com/KyleBanks/depth v1.2.1 // indirect
 	github.com/agnivade/levenshtein v1.2.1 // indirect
-	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect
-	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect
+	github.com/andybalholm/brotli v1.2.0 // indirect
+	github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
+	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect
+	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect
 	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect
-	github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect
-	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect
-	github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 // indirect
+	github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect
 	github.com/aws/smithy-go v1.24.0 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect
 	github.com/go-jose/go-jose/v4 v4.1.3 // indirect
-	github.com/go-openapi/jsonpointer v0.22.3 // indirect
-	github.com/go-openapi/jsonreference v0.21.3 // indirect
-	github.com/go-openapi/spec v0.22.1 // indirect
+	github.com/go-openapi/jsonpointer v0.22.4 // indirect
+	github.com/go-openapi/jsonreference v0.21.4 // indirect
+	github.com/go-openapi/spec v0.22.3 // indirect
 	github.com/go-openapi/swag/conv v0.25.4 // indirect
 	github.com/go-openapi/swag/jsonname v0.25.4 // indirect
 	github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
@@ -79,45 +77,48 @@ require (
 	github.com/go-openapi/swag/stringutils v0.25.4 // indirect
 	github.com/go-openapi/swag/typeutils v0.25.4 // indirect
 	github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
||||
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
|
||||
github.com/goccy/go-yaml v1.19.0 // indirect
|
||||
github.com/golang/snappy v0.0.4 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
|
||||
github.com/goccy/go-yaml v1.19.2 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/securecookie v1.1.2 // indirect
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||
github.com/jonboulle/clockwork v0.5.0 // indirect
|
||||
github.com/jpillora/backoff v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.18.1 // indirect
|
||||
github.com/klauspost/compress v1.18.4 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
|
||||
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
|
||||
github.com/nats-io/nkeys v0.4.11 // indirect
|
||||
github.com/nats-io/nats.go v1.49.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.15 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/oapi-codegen/runtime v1.2.0 // indirect
|
||||
github.com/parquet-go/bitpack v1.0.0 // indirect
|
||||
github.com/parquet-go/jsonlite v1.4.0 // indirect
|
||||
github.com/pierrec/lz4/v4 v4.1.25 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/rogpeppe/go-internal v1.10.0 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/sosodev/duration v1.3.1 // indirect
|
||||
github.com/stmcginnis/gofish v0.21.3 // indirect
|
||||
github.com/stretchr/objx v0.5.2 // indirect
|
||||
github.com/swaggo/files v1.0.1 // indirect
|
||||
github.com/twpayne/go-geom v1.6.1 // indirect
|
||||
github.com/urfave/cli/v2 v2.27.7 // indirect
|
||||
github.com/urfave/cli/v3 v3.6.1 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
golang.org/x/mod v0.30.0 // indirect
|
||||
golang.org/x/net v0.47.0 // indirect
|
||||
golang.org/x/sync v0.18.0 // indirect
|
||||
golang.org/x/sys v0.38.0 // indirect
|
||||
golang.org/x/text v0.31.0 // indirect
|
||||
golang.org/x/tools v0.39.0 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect
|
||||
golang.org/x/mod v0.33.0 // indirect
|
||||
golang.org/x/net v0.51.0 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/sys v0.41.0 // indirect
|
||||
golang.org/x/text v0.34.0 // indirect
|
||||
golang.org/x/tools v0.42.0 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
sigs.k8s.io/yaml v1.6.0 // indirect
|
||||
)
|
||||
|
||||
323
go.sum
323
go.sum
@@ -1,81 +1,89 @@
|
||||
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
|
||||
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
|
||||
github.com/99designs/gqlgen v0.17.84 h1:iVMdiStgUVx/BFkMb0J5GAXlqfqtQ7bqMCYK6v52kQ0=
|
||||
github.com/99designs/gqlgen v0.17.84/go.mod h1:qjoUqzTeiejdo+bwUg8unqSpeYG42XrcrQboGIezmFA=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
|
||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
|
||||
github.com/ClusterCockpit/cc-lib v1.0.0 h1:/8DFRomt4BpVWKWrsEZ/ru4K8x76QTVnEgdwHc5eSps=
|
||||
github.com/ClusterCockpit/cc-lib v1.0.0/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k=
|
||||
github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRcU6w=
|
||||
github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw=
|
||||
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
|
||||
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.7.0 h1:EMTShk6rMTR1wlfmQ8SVCawH1OdltUbD3kVQmaW+5pE=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.7.0/go.mod h1:0Etx8WMs0lYZ4tiOQizY18CQop+2i3WROvU9rMUxHA4=
|
||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
|
||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY=
|
||||
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
|
||||
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
|
||||
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
|
||||
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
|
||||
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
|
||||
github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
|
||||
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
|
||||
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
|
||||
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
||||
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
|
||||
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
|
||||
github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY=
|
||||
github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
|
||||
github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc=
|
||||
github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
|
||||
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI=
|
||||
github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4=
|
||||
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ=
|
||||
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
|
||||
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
|
||||
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
||||
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
|
||||
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
|
||||
github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4=
|
||||
github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 h1:DHctwEM8P8iTXFxC/QK0MRjwEpWQeM9yzidCRjldUz0=
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3/go.mod h1:xdCzcZEtnSTKVDOmUZs4l/j3pSV6rpo1WXl5ugNsL8Y=
|
||||
github.com/aws/aws-sdk-go-v2/config v1.31.20 h1:/jWF4Wu90EhKCgjTdy1DGxcbcbNrjfBHvksEL79tfQc=
|
||||
github.com/aws/aws-sdk-go-v2/config v1.31.20/go.mod h1:95Hh1Tc5VYKL9NJ7tAkDcqeKt+MCXQB1hQZaRdJIZE0=
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.18.24 h1:iJ2FmPT35EaIB0+kMa6TnQ+PwG5A1prEdAw+PsMzfHg=
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.18.24/go.mod h1:U91+DrfjAiXPDEGYhh/x29o4p0qHX5HDqG7y5VViv64=
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk=
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk=
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40=
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8=
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY=
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M=
|
||||
github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU=
|
||||
github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU=
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4=
|
||||
github.com/aws/aws-sdk-go-v2/config v1.32.8 h1:iu+64gwDKEoKnyTQskSku72dAwggKI5sV6rNvgSMpMs=
|
||||
github.com/aws/aws-sdk-go-v2/config v1.32.8/go.mod h1:MI2XvA+qDi3i9AJxX1E2fu730syEBzp/jnXrjxuHwgI=
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.19.8 h1:Jp2JYH1lRT3KhX4mshHPvVYsR5qqRec3hGvEarNYoR0=
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.19.8/go.mod h1:fZG9tuvyVfxknv1rKibIz3DobRaFw1Poe8IKtXB3XYY=
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY=
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA=
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U=
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ=
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik=
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM=
|
||||
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
|
||||
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
|
||||
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 h1:eg/WYAa12vqTphzIdWMzqYRVKKnCboVPRlvaybNCqPA=
|
||||
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13/go.mod h1:/FDdxWhz1486obGrKKC1HONd7krpk38LBt+dutLcN9k=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 h1:NvMjwvv8hpGUILarKw7Z4Q0w1H9anXKsesMxtw++MA4=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4/go.mod h1:455WPHSwaGj2waRSpQp7TsnpOnBfw8iDfPfbwl7KPJE=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 h1:zhBJXdhWIFZ1acfDYIhu4+LCzdUS2Vbcum7D01dXlHQ=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13/go.mod h1:JaaOeCE368qn2Hzi3sEzY6FgAZVCIYcC2nwbro2QCh8=
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 h1:DhdbtDl4FdNlj31+xiRXANxEE+eC7n8JQz+/ilwQ8Uc=
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2/go.mod h1:+wArOOrcHUevqdto9k1tKOF5++YTe9JEcPSc9Tx2ZSw=
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 h1:NjShtS1t8r5LUfFVtFeI8xLAHQNTa7UI0VawXlrBMFQ=
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.3/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k=
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 h1:gTsnx0xXNQ6SBbymoDvcoRHL+q4l/dAFsQuKfDWSaGc=
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo=
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 h1:HK5ON3KmQV2HcAunnx4sKLB9aPf3gKGwVAf7xnx0QT0=
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.40.2/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk=
|
||||
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 h1:JqcdRG//czea7Ppjb+g/n4o8i/R50aTBHkA7vu0lK+k=
|
||||
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17/go.mod h1:CO+WeGmIdj/MlPel2KwID9Gt7CNq4M65HUfBW97liM0=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 h1:Z5EiPIzXKewUQK0QTMkutjiaPVeVYXX7KIqhXu/0fXs=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8/go.mod h1:FsTpJtvC4U1fyDXk7c71XoDv3HlRm8V3NiYLeYLh5YE=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 h1:RuNSMoozM8oXlgLG/n6WLaFGoea7/CddrCfIiSA+xdY=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17/go.mod h1:F2xxQ9TZz5gDWsclCtPQscGpP0VUOc8RqgFM3vDENmU=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 h1:bGeHBsGZx0Dvu/eJC0Lh9adJa3M1xREcndxLNZlve2U=
|
||||
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17/go.mod h1:dcW24lbU0CzHusTE8LLHhRLI42ejmINN8Lcr22bwh/g=
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0 h1:oeu8VPlOre74lBA/PMhxa5vewaMIMmILM+RraSyB8KA=
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0/go.mod h1:5jggDlZ2CLQhwJBiZJb4vfk4f0GxWdEDruWKEJ1xOdo=
|
||||
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 h1:VrhDvQib/i0lxvr3zqlUwLwJP4fpmpyD9wYG1vfSu+Y=
|
||||
github.com/aws/aws-sdk-go-v2/service/signin v1.0.5/go.mod h1:k029+U8SY30/3/ras4G/Fnv/b88N4mAfliNn08Dem4M=
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 h1:v6EiMvhEYBoHABfbGB4alOYmCIrcgyPPiBE1wZAEbqk=
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.9/go.mod h1:yifAsgBxgJWn3ggx70A3urX2AN49Y5sJTD1UQFlfqBw=
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14 h1:0jbJeuEHlwKJ9PfXtpSFc4MF+WIWORdhN1n30ITZGFM=
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.14/go.mod h1:sTGThjphYE4Ohw8vJiRStAcu3rbjtXRsdNB0TvZ5wwo=
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/pJ1jOWYlFDJTjRQ=
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ=
|
||||
github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk=
|
||||
github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
|
||||
github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
|
||||
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
|
||||
github.com/coreos/go-oidc/v3 v3.16.0 h1:qRQUCFstKpXwmEjDQTIbyY/5jF00+asXzSkmkoa/mow=
|
||||
github.com/coreos/go-oidc/v3 v3.16.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
|
||||
github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc=
|
||||
github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
@@ -85,44 +93,30 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo=
|
||||
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
|
||||
github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4=
|
||||
github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU=
|
||||
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI=
|
||||
github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
|
||||
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec=
|
||||
github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||
github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM=
|
||||
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
||||
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
||||
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo=
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
|
||||
github.com/go-co-op/gocron/v2 v2.18.2 h1:+5VU41FUXPWSPKLXZQ/77SGzUiPCcakU0v7ENc2H20Q=
|
||||
github.com/go-co-op/gocron/v2 v2.18.2/go.mod h1:Zii6he+Zfgy5W9B+JKk/KwejFOW0kZTFvHtwIpR4aBI=
|
||||
github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
|
||||
github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
|
||||
github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE=
|
||||
github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58=
|
||||
github.com/go-co-op/gocron/v2 v2.19.1 h1:B4iLeA0NB/2iO3EKQ7NfKn5KsQgZfjb2fkvoZJU3yBI=
|
||||
github.com/go-co-op/gocron/v2 v2.19.1/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U=
|
||||
github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs=
|
||||
github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
|
||||
github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4=
|
||||
github.com/go-ldap/ldap/v3 v3.4.12/go.mod h1:+SPAGcTtOfmGsCb3h1RFiq4xpp4N636G75OEace8lNo=
|
||||
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
|
||||
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/go-openapi/jsonpointer v0.22.3 h1:dKMwfV4fmt6Ah90zloTbUKWMD+0he+12XYAsPotrkn8=
|
||||
github.com/go-openapi/jsonpointer v0.22.3/go.mod h1:0lBbqeRsQ5lIanv3LHZBrmRGHLHcQoOXQnf88fHlGWo=
|
||||
github.com/go-openapi/jsonreference v0.21.3 h1:96Dn+MRPa0nYAR8DR1E03SblB5FJvh7W6krPI0Z7qMc=
|
||||
github.com/go-openapi/jsonreference v0.21.3/go.mod h1:RqkUP0MrLf37HqxZxrIAtTWW4ZJIK1VzduhXYBEeGc4=
|
||||
github.com/go-openapi/spec v0.22.1 h1:beZMa5AVQzRspNjvhe5aG1/XyBSMeX1eEOs7dMoXh/k=
|
||||
github.com/go-openapi/spec v0.22.1/go.mod h1:c7aeIQT175dVowfp7FeCvXXnjN/MrpaONStibD2WtDA=
|
||||
github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4=
|
||||
github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80=
|
||||
github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8=
|
||||
github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4=
|
||||
github.com/go-openapi/spec v0.22.3 h1:qRSmj6Smz2rEBxMnLRBMeBWxbbOvuOoElvSvObIgwQc=
|
||||
github.com/go-openapi/spec v0.22.3/go.mod h1:iIImLODL2loCh3Vnox8TY2YWYJZjMAKYyLH2Mu8lOZs=
|
||||
github.com/go-openapi/swag v0.19.15 h1:D2NRCBzS9/pEY3gP9Nl8aDqGUcPFrwG2p+CNFrLyrCM=
|
||||
github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4=
|
||||
github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU=
|
||||
@@ -145,35 +139,28 @@ github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8U
|
||||
github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls=
|
||||
github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
|
||||
github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
|
||||
github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
|
||||
github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
|
||||
github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo=
|
||||
github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU=
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs=
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
|
||||
github.com/goccy/go-yaml v1.19.0 h1:EmkZ9RIsX+Uq4DYFowegAuJo8+xdX3T/2dwNPXbxEYE=
|
||||
github.com/goccy/go-yaml v1.19.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
||||
github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro=
|
||||
github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
|
||||
github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM=
|
||||
github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA=
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE=
|
||||
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
|
||||
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
|
||||
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
||||
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
|
||||
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
|
||||
github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c=
|
||||
github.com/google/gops v0.3.29 h1:n98J2qSOK1NJvRjdLDcjgDryjpIBGhbaqph1mXKL0rY=
|
||||
github.com/google/gops v0.3.29/go.mod h1:8N3jZftuPazvUwtYY/ncG4iPrjp15ysNKLfq+QQPiwc=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE=
|
||||
github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w=
|
||||
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
||||
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
|
||||
github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA=
|
||||
@@ -186,17 +173,14 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C
|
||||
github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
|
||||
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4=
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
|
||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
|
||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
|
||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
|
||||
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
|
||||
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
|
||||
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
|
||||
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
|
||||
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
|
||||
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
|
||||
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
|
||||
@@ -215,17 +199,11 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I=
|
||||
github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60=
|
||||
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
|
||||
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
|
||||
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
|
||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw=
|
||||
@@ -235,43 +213,36 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm
|
||||
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
|
||||
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
|
||||
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||
github.com/linkedin/goavro/v2 v2.14.1 h1:/8VjDpd38PRsy02JS0jflAu7JZPfJcGTwqWgMkFS2iI=
|
||||
github.com/linkedin/goavro/v2 v2.14.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
|
||||
github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
|
||||
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
|
||||
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
|
||||
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
|
||||
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
|
||||
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
|
||||
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/mattn/go-sqlite3 v1.14.34 h1:3NtcvcUnFBPsuRcno8pUtupspG/GM+9nZ88zgJcp6Zk=
|
||||
github.com/mattn/go-sqlite3 v1.14.34/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
||||
github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM=
|
||||
github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
|
||||
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
|
||||
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
|
||||
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
|
||||
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
|
||||
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
|
||||
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
|
||||
github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE=
|
||||
github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw=
|
||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
||||
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
|
||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
|
||||
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
|
||||
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
|
||||
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
|
||||
github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4=
|
||||
github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0=
|
||||
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA=
|
||||
github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs=
|
||||
github.com/parquet-go/jsonlite v1.4.0 h1:RTG7prqfO0HD5egejU8MUDBN8oToMj55cgSV1I0zNW4=
|
||||
github.com/parquet-go/jsonlite v1.4.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0=
|
||||
github.com/parquet-go/parquet-go v0.27.0 h1:vHWK2xaHbj+v1DYps03yDRpEsdtOeKbhiXUaixoPb3g=
|
||||
github.com/parquet-go/parquet-go v0.27.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg=
|
||||
github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0=
|
||||
github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
|
||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
@@ -279,16 +250,17 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc=
|
||||
github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI=
|
||||
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
|
||||
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
|
||||
github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q=
|
||||
github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo=
|
||||
github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0=
|
||||
github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU=
|
||||
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
|
||||
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
|
||||
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
||||
@@ -297,15 +269,15 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
|
||||
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
|
||||
github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
|
||||
github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
|
||||
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
||||
github.com/stmcginnis/gofish v0.21.3 h1:EBLCHfORnbx7MPw7lplOOVe9QAD1T3XRVz6+a1Z4z5Q=
|
||||
github.com/stmcginnis/gofish v0.21.3/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE=
|
||||
@@ -314,6 +286,8 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64
|
||||
github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ=
|
||||
github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI=
|
||||
github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg=
|
||||
github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4=
|
||||
github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028=
|
||||
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
|
||||
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
|
||||
github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo=
|
||||
@@ -322,17 +296,9 @@ github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW
|
||||
github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts=
|
||||
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg=
|
||||
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
|
||||
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
|
||||
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
|
||||
go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ=
|
||||
go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I=
|
||||
go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE=
|
||||
go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E=
|
||||
go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4=
|
||||
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
|
||||
@@ -341,33 +307,33 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
||||
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
|
||||
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
|
||||
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
|
||||
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
|
||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
|
||||
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
|
||||
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
|
||||
golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY=
|
||||
golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
||||
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
|
||||
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
|
||||
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -375,26 +341,23 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ=
golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=

52 gqlgen.yml
@@ -52,51 +52,51 @@ models:
      - github.com/99designs/gqlgen/graphql.Int64
      - github.com/99designs/gqlgen/graphql.Int32
  Job:
    model: "github.com/ClusterCockpit/cc-lib/schema.Job"
    model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job"
    fields:
      tags:
        resolver: true
      metaData:
        resolver: true
  Cluster:
    model: "github.com/ClusterCockpit/cc-lib/schema.Cluster"
    model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster"
    fields:
      partitions:
        resolver: true
  # Node:
  #   model: "github.com/ClusterCockpit/cc-lib/schema.Node"
  #   model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node"
  #   fields:
  #     metaData:
  #       resolver: true
  NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" }
  MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" }
  MetricValue: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricValue" }
  NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" }
  MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" }
  MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" }
  JobStatistics:
    { model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" }
  GlobalMetricListItem:
    { model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" }
  ClusterSupport:
    { model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" }
  Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" }
  Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" }
  JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" }
  Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" }
  Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" }
  Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" }
  JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" }
  Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" }
  SchedulerState:
    { model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" }
  HealthState:
    { model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" }
  JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" }
  Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" }
  JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" }
  Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" }
  MetricStatistics:
    { model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" }
  MetricConfig:
    { model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" }
  SubClusterConfig:
    { model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" }
  Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" }
  Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" }
  Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" }
  Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" }
  FilterRanges:
    { model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" }
  SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" }
  StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" }
  Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" }
    { model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" }
  SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" }
  StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" }
  Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" }

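Every change in this file is the same mechanical migration: cc-lib moved to a v2 Go module, so each model mapping gains a /v2 path element per Go's semantic import versioning. A minimal consumer-side sketch of what the bump means for Go code; the blank identifier is purely illustrative:

package main

import (
    // before: "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/ClusterCockpit/cc-lib/v2/schema" // package name is still "schema"
)

// Types are unchanged; only the module path carries the major version.
var _ schema.Job

func main() {}
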
@@ -3,7 +3,7 @@ Description=ClusterCockpit Web Server
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
After=mariadb.service mysql.service
# Database is file-based SQLite - no service dependency required

[Service]
WorkingDirectory=/opt/monitoring/cc-backend
@@ -12,7 +12,7 @@ NotifyAccess=all
Restart=on-failure
RestartSec=30
TimeoutStopSec=100
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json
ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json --server

[Install]
WantedBy=multi-user.target

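The unit now reflects the file-based SQLite database (no mariadb/mysql ordering dependency) and the reworked CLI, where daemon mode is selected with an explicit --server flag. After changing the unit, a typical reload sequence would be systemctl daemon-reload followed by systemctl restart clustercockpit.service; the unit name here is an assumption, matching the default used by the new log endpoint further below.
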
@@ -23,47 +23,45 @@ import (
    "github.com/ClusterCockpit/cc-backend/internal/auth"
    "github.com/ClusterCockpit/cc-backend/internal/config"
    "github.com/ClusterCockpit/cc-backend/internal/graph"
    "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
    "github.com/ClusterCockpit/cc-backend/internal/metricdata"
    "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/gorilla/mux"
    "github.com/ClusterCockpit/cc-backend/pkg/metricstore"
    ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
    "github.com/go-chi/chi/v5"

    _ "github.com/mattn/go-sqlite3"
)

func setup(t *testing.T) *api.RestAPI {
    repository.ResetConnection()

    const testconfig = `{
    "main": {
        "addr": "0.0.0.0:8080",
        "validate": false,
        "apiAllowedIPs": [
            "*"
        ]
    },
    "main": {
        "addr": "0.0.0.0:8080",
        "validate": false,
        "api-allowed-ips": [
            "*"
        ]
    },
    "metric-store": {
        "checkpoints": {
            "interval": "12h"
        },
        "retention-in-memory": "48h",
        "memory-cap": 100
    },
    "archive": {
        "kind": "file",
        "path": "./var/job-archive"
        "kind": "file",
        "path": "./var/job-archive"
    },
    "auth": {
        "jwts": {
            "max-age": "2m"
        "jwts": {
            "max-age": "2m"
        }
    }
    },
    "clusters": [
        {
            "name": "testcluster",
            "metricDataRepository": {"kind": "test", "url": "bla:8081"},
            "filterRanges": {
                "numNodes": { "from": 1, "to": 64 },
                "duration": { "from": 0, "to": 86400 },
                "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
            }
        }
    ]
}`
    const testclusterJSON = `{
        "name": "testcluster",
@@ -141,7 +139,7 @@ func setup(t *testing.T) *api.RestAPI {
    }

    dbfilepath := filepath.Join(tmpdir, "test.db")
    err := repository.MigrateDB("sqlite3", dbfilepath)
    err := repository.MigrateDB(dbfilepath)
    if err != nil {
        t.Fatal(err)
    }
@@ -152,28 +150,23 @@ func setup(t *testing.T) *api.RestAPI {
    }

    ccconf.Init(cfgFilePath)
    metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}

    // Load and check main configuration
    if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
        if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
            config.Init(cfg, clustercfg)
        } else {
            cclog.Abort("Cluster configuration must be present")
        }
        config.Init(cfg)
    } else {
        cclog.Abort("Main configuration must be present")
    }
    archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)

    repository.Connect("sqlite3", dbfilepath)
    repository.Connect(dbfilepath)

    if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
    if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
        t.Fatal(err)
    }

    if err := metricdata.Init(); err != nil {
        t.Fatal(err)
    }
    // metricstore initialization removed - it's initialized via callback in tests

    archiver.Start(repository.GetJobRepository(), context.Background())

@@ -190,11 +183,9 @@ func setup(t *testing.T) *api.RestAPI {
}

func cleanup() {
    // Gracefully shutdown archiver with timeout
    if err := archiver.Shutdown(5 * time.Second); err != nil {
        cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
    }
    // TODO: Clear all caches, reset all modules, etc...
}

/*
@@ -221,16 +212,14 @@ func TestRestApi(t *testing.T) {
        },
    }

    metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
    metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
        return testData, nil
    }

    r := mux.NewRouter()
    r.PathPrefix("/api").Subrouter()
    r.StrictSlash(true)
    r := chi.NewRouter()
    restapi.MountAPIRoutes(r)

    var TestJobId int64 = 123
    var TestJobID int64 = 123
    TestClusterName := "testcluster"
    var TestStartTime int64 = 123456789

@@ -280,7 +269,7 @@ func TestRestApi(t *testing.T) {
    }
    // resolver := graph.GetResolverInstance()
    restapi.JobRepository.SyncJobs()
    job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
    job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
    if err != nil {
        t.Fatal(err)
    }
@@ -338,7 +327,7 @@ func TestRestApi(t *testing.T) {
    }

    // Archiving happens asynchronously, will be completed in cleanup
    job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
    job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
    if err != nil {
        t.Fatal(err)
    }
@@ -366,7 +355,7 @@ func TestRestApi(t *testing.T) {
    }

    t.Run("CheckArchive", func(t *testing.T) {
        data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
        data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
        if err != nil {
            t.Fatal(err)
        }
@@ -464,4 +453,198 @@ func TestRestApi(t *testing.T) {
    if !ok {
        t.Fatal("subtest failed")
    }

    t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
        contextUserValue := &schema.User{
            Username:   "testuser",
            Projects:   make([]string, 0),
            Roles:      []string{"api"},
            AuthType:   0,
            AuthSource: 2,
        }

        req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
        recorder := httptest.NewRecorder()

        ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)

        r.ServeHTTP(recorder, req.WithContext(ctx))
        response := recorder.Result()
        if response.StatusCode != http.StatusOK {
            t.Fatal(response.Status, recorder.Body.String())
        }

        var result api.GetUsedNodesAPIResponse
        if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
            t.Fatal(err)
        }

        if result.UsedNodes == nil {
            t.Fatal("expected usedNodes to be non-nil")
        }

        if len(result.UsedNodes) != 0 {
            t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
        }
    })
}

// TestStopJobWithReusedJobId verifies that stopping a recently started job works
// even when an older job with the same jobId exists in the job table (e.g. with
// state "failed"). This is a regression test for the bug where Find() on the job
// table would match the old job instead of the new one still in job_cache.
func TestStopJobWithReusedJobId(t *testing.T) {
    restapi := setup(t)
    t.Cleanup(cleanup)

    testData := schema.JobData{
        "load_one": map[schema.MetricScope]*schema.JobMetric{
            schema.MetricScopeNode: {
                Unit:     schema.Unit{Base: "load"},
                Timestep: 60,
                Series: []schema.Series{
                    {
                        Hostname:   "host123",
                        Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
                        Data:       []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
                    },
                },
            },
        },
    }

    metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
        return testData, nil
    }

    r := chi.NewRouter()
    restapi.MountAPIRoutes(r)

    const contextUserKey repository.ContextKey = "user"
    contextUserValue := &schema.User{
        Username:   "testuser",
        Projects:   make([]string, 0),
        Roles:      []string{"user"},
        AuthType:   0,
        AuthSource: 2,
    }

    // Step 1: Start the first job (jobId=999)
    const startJobBody1 string = `{
        "jobId": 999,
        "user": "testuser",
        "project": "testproj",
        "cluster": "testcluster",
        "partition": "default",
        "walltime": 3600,
        "numNodes": 1,
        "numHwthreads": 8,
        "numAcc": 0,
        "shared": "none",
        "monitoringStatus": 1,
        "smt": 1,
        "resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
        "startTime": 200000000
    }`

    if ok := t.Run("StartFirstJob", func(t *testing.T) {
        req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
        recorder := httptest.NewRecorder()
        ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
        r.ServeHTTP(recorder, req.WithContext(ctx))
        if recorder.Result().StatusCode != http.StatusCreated {
            t.Fatal(recorder.Result().Status, recorder.Body.String())
        }
    }); !ok {
        return
    }

    // Step 2: Sync to move job from cache to job table, then stop it as "failed"
    time.Sleep(1 * time.Second)
    restapi.JobRepository.SyncJobs()

    const stopJobBody1 string = `{
        "jobId": 999,
        "startTime": 200000000,
        "cluster": "testcluster",
        "jobState": "failed",
        "stopTime": 200001000
    }`

    if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
        req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
        recorder := httptest.NewRecorder()
        ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
        r.ServeHTTP(recorder, req.WithContext(ctx))
        if recorder.Result().StatusCode != http.StatusOK {
            t.Fatal(recorder.Result().Status, recorder.Body.String())
        }

        jobid, cluster := int64(999), "testcluster"
        job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
        if err != nil {
            t.Fatal(err)
        }
        if job.State != schema.JobStateFailed {
            t.Fatalf("expected first job to be failed, got: %s", job.State)
        }
    }); !ok {
        return
    }

    // Wait for archiving to complete
    time.Sleep(1 * time.Second)

    // Step 3: Start a NEW job with the same jobId=999 but different startTime.
    // This job will sit in job_cache (not yet synced).
    const startJobBody2 string = `{
        "jobId": 999,
        "user": "testuser",
        "project": "testproj",
        "cluster": "testcluster",
        "partition": "default",
        "walltime": 3600,
        "numNodes": 1,
        "numHwthreads": 8,
        "numAcc": 0,
        "shared": "none",
        "monitoringStatus": 1,
        "smt": 1,
        "resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
        "startTime": 300000000
    }`

    if ok := t.Run("StartSecondJob", func(t *testing.T) {
        req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
        recorder := httptest.NewRecorder()
        ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
        r.ServeHTTP(recorder, req.WithContext(ctx))
        if recorder.Result().StatusCode != http.StatusCreated {
            t.Fatal(recorder.Result().Status, recorder.Body.String())
        }
    }); !ok {
        return
    }

    // Step 4: Stop the second job WITHOUT syncing first.
    // Before the fix, this would fail because Find() on the job table would
    // match the old failed job (jobId=999) and reject with "already stopped".
    const stopJobBody2 string = `{
        "jobId": 999,
        "startTime": 300000000,
        "cluster": "testcluster",
        "jobState": "completed",
        "stopTime": 300001000
    }`

    t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
        req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
        recorder := httptest.NewRecorder()
        ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
        r.ServeHTTP(recorder, req.WithContext(ctx))
        if recorder.Result().StatusCode != http.StatusOK {
            t.Fatalf("expected stop to succeed for cached job, got: %s %s",
                recorder.Result().Status, recorder.Body.String())
        }
    })
}

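The regression test can be run on its own with go test -run TestStopJobWithReusedJobId ./internal/api, assuming the test file sits in internal/api alongside the other files in this changeset.
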
@@ -13,7 +13,7 @@ import (

    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
)

// GetClustersAPIResponse model
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -36,9 +36,9 @@ type GetClustersAPIResponse struct {
// @router /api/clusters/ [get]
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
    if user := repository.GetUserFromContext(r.Context()); user != nil &&
        !user.HasRole(schema.RoleApi) {
        !user.HasRole(schema.RoleAPI) {

        handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
        handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
        return
    }

1235 internal/api/docs.go
File diff suppressed because it is too large
@@ -22,12 +22,12 @@ import (
    "github.com/ClusterCockpit/cc-backend/internal/graph"
    "github.com/ClusterCockpit/cc-backend/internal/graph/model"
    "github.com/ClusterCockpit/cc-backend/internal/importer"
    "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
    "github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/gorilla/mux"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
    "github.com/go-chi/chi/v5"
)

const (
@@ -72,6 +72,14 @@ type EditMetaRequest struct {
    Value string `json:"value" example:"bash script"`
}

// JobMetaRequest model
type JobMetaRequest struct {
    JobId     *int64          `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
    Cluster   *string         `json:"cluster" example:"fritz"`                    // Cluster of job
    StartTime *int64          `json:"startTime" example:"1649723812"`             // Start Time of job as epoch
    Payload   EditMetaRequest `json:"payload"`                                    // Content to Add to Job Meta_Data
}

type TagJobAPIRequest []*APITag

type GetJobAPIRequest []string
@@ -104,7 +112,7 @@ type JobMetricWithName struct {
// @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
// @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -232,7 +240,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @produce json
// @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -243,17 +251,17 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/{id} [get]
func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) {
    // Fetch job from db
    id, ok := mux.Vars(r)["id"]
    id := chi.URLParam(r, "id")
    var job *schema.Job
    var err error
    if ok {
    if id != "" {
        id, e := strconv.ParseInt(id, 10, 64)
        if e != nil {
            handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
            return
        }

        job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
        job, err = api.JobRepository.FindByID(r.Context(), id) // Get Job from Repo by ID
    } else {
        handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
        return
@@ -293,7 +301,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
    }

    if r.URL.Query().Get("all-metrics") == "true" {
        data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
        data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
        if err != nil {
            cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
            return
@@ -324,8 +332,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @accept json
// @produce json
// @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @param request body api.GetJobAPIRequest true "Array of metric names"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -336,17 +344,17 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @router /api/jobs/{id} [post]
func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
    // Fetch job from db
    id, ok := mux.Vars(r)["id"]
    id := chi.URLParam(r, "id")
    var job *schema.Job
    var err error
    if ok {
    if id != "" {
        id, e := strconv.ParseInt(id, 10, 64)
        if e != nil {
            handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
            return
        }

        job, err = api.JobRepository.FindById(r.Context(), id)
        job, err = api.JobRepository.FindByID(r.Context(), id)
    } else {
        handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
        return
@@ -389,7 +397,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
        resolution = max(resolution, mc.Timestep)
    }

    data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
    data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
    if err != nil {
        cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
        return
@@ -423,29 +431,29 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
}

// editMeta godoc
// @summary Edit meta-data json
// @summary Edit meta-data json of job identified by database id
// @tags Job add and modify
// @description Edit key value pairs in job metadata json
// @description Edit key value pairs in job metadata json of job specified by database id
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.EditMetaRequest true "Kay value pair to add"
// @param request body api.EditMetaRequest true "Metadata Key value pair to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/{id} [post]
// @router /api/jobs/edit_meta/{id} [patch]
func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
    id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
    id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
    if err != nil {
        handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
        return
    }

    job, err := api.JobRepository.FindById(r.Context(), id)
    job, err := api.JobRepository.FindByID(r.Context(), id)
    if err != nil {
        handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
        return
@@ -469,6 +477,54 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
    }
}

// editMetaByRequest godoc
// @summary Edit meta-data json of job identified by request
// @tags Job add and modify
// @description Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param request body api.JobMetaRequest true "Specifies job and payload to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/ [patch]
func (api *RestAPI) editMetaByRequest(rw http.ResponseWriter, r *http.Request) {
    // Parse request body
    req := JobMetaRequest{}
    if err := decode(r.Body, &req); err != nil {
        handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
        return
    }

    // Fetch job (that will have its meta_data edited) from db
    var job *schema.Job
    var err error
    if req.JobId == nil {
        handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
        return
    }

    // log.Printf("loading db job for editMetaByRequest... : JobMetaRequest=%v", req)
    job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
    if err != nil {
        handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
        return
    }

    if err := api.JobRepository.UpdateMetadata(job, req.Payload.Key, req.Payload.Value); err != nil {
        http.Error(rw, err.Error(), http.StatusInternalServerError)
        return
    }

    rw.Header().Add("Content-Type", "application/json")
    rw.WriteHeader(http.StatusOK)
    json.NewEncoder(rw).Encode(job)
}

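JobMetaRequest backs the new PATCH /api/jobs/edit_meta/ route, which addresses the job by jobId, cluster and startTime rather than by database ID. A hedged client sketch assembled from the struct's example tags; the host, auth header, and metadata key are illustrative assumptions:

package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    body := []byte(`{
        "jobId": 123000,
        "cluster": "fritz",
        "startTime": 1649723812,
        "payload": { "key": "jobScript", "value": "bash script" }
    }`)
    req, err := http.NewRequest(http.MethodPatch, "http://localhost:8080/api/jobs/edit_meta/", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Type", "application/json")
    req.Header.Set("X-Auth-Token", "<jwt>") // assumed auth header
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    fmt.Println(resp.Status)
}
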
// tagJob godoc
// @summary Adds one or more tags to a job
// @tags Job add and modify
@@ -478,7 +534,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -487,13 +543,13 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /api/jobs/tag_job/{id} [post]
func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
    id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
    id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
    if err != nil {
        handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
        return
    }

    job, err := api.JobRepository.FindById(r.Context(), id)
    job, err := api.JobRepository.FindByID(r.Context(), id)
    if err != nil {
        handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
        return
@@ -542,7 +598,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -551,13 +607,13 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /jobs/tag_job/{id} [delete]
func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
    id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
    id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
    if err != nil {
        handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
        return
    }

    job, err := api.JobRepository.FindById(r.Context(), id)
    job, err := api.JobRepository.FindByID(r.Context(), id)
    if err != nil {
        handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
        return
@@ -606,7 +662,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
// @description Tag wills be removed from respective archive files.
// @accept json
// @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -650,7 +706,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param request body schema.Job true "Job to add"
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
// @success 201 {object} api.DefaultAPIResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -691,13 +747,21 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
        for _, job := range jobs {
            // Check if jobs are within the same day (prevent duplicates)
            if (req.StartTime - job.StartTime) < secondsPerDay {
                handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
                handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
                return
            }
        }
    }

    id, err := api.JobRepository.Start(&req)
    // When tags are present, insert directly into the job table so that the
    // returned ID can be used with AddTagOrCreate (which queries the job table).
    // Jobs without tags use the cache path as before.
    var id int64
    if len(req.Tags) > 0 {
        id, err = api.JobRepository.StartDirect(&req)
    } else {
        id, err = api.JobRepository.Start(&req)
    }
    if err != nil {
        handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
        return

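The StartDirect branch matters for clients that submit tags together with the job: the job must land in the job table immediately so that AddTagOrCreate can resolve the returned ID. A start_job body of roughly this shape would take the direct path; the tag fields are an assumption, following the usual ClusterCockpit type/name/scope shape:

{
    "jobId": 1001,
    "user": "testuser",
    "project": "testproj",
    "cluster": "testcluster",
    "numNodes": 1,
    "startTime": 300100000,
    "resources": [{"hostname": "host123"}],
    "tags": [{"type": "workflow", "name": "training", "scope": "global"}]
}
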
@@ -728,7 +792,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
// @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to 'Job' scheme.
// @produce json
// @param request body api.StopJobApiRequest true "All fields required"
// @param request body api.StopJobAPIRequest true "All fields required"
// @success 200 {object} schema.Job "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -754,20 +818,20 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
        return
    }

    // cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
    job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
    isCached := false
    job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
    if err != nil {
        // Try cached jobs if not found in main repository
        cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
        if cachedErr != nil {
            // Combine both errors for better debugging
            handleError(fmt.Errorf("finding job failed: %w (cached lookup also failed: %v)", err, cachedErr), http.StatusNotFound, rw)
        // Not in cache, try main job table
        job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
        if err != nil {
            handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
            return
        }
        job = cachedJob
    } else {
        isCached = true
    }

    api.checkAndHandleStopJob(rw, job, req)
    api.checkAndHandleStopJob(rw, job, req, isCached)
}

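Read as one piece, the new-side lines of this hunk make the lookup order explicit: stopJobByRequest now consults job_cache first and only falls back to the main job table on a cache miss (excerpt of the new code, not a complete function):

    isCached := false
    job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
    if err != nil {
        // Not in cache, try main job table
        job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
        if err != nil {
            handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
            return
        }
    } else {
        isCached = true
    }

    api.checkAndHandleStopJob(rw, job, req, isCached)
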
// deleteJobByID godoc
@@ -776,7 +840,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json
// @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -787,16 +851,16 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/delete_job/{id} [delete]
func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
    // Fetch job (that will be stopped) from db
    id, ok := mux.Vars(r)["id"]
    id := chi.URLParam(r, "id")
    var err error
    if ok {
    if id != "" {
        id, e := strconv.ParseInt(id, 10, 64)
        if e != nil {
            handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
            return
        }

        err = api.JobRepository.DeleteJobById(id)
        err = api.JobRepository.DeleteJobByID(id)
    } else {
        handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
        return
@@ -820,8 +884,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// @description Job to delete is specified by request body. All fields are required in this case.
// @accept json
// @produce json
// @param request body api.DeleteJobApiRequest true "All fields required"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @param request body api.DeleteJobAPIRequest true "All fields required"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -852,7 +916,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
        return
    }

    err = api.JobRepository.DeleteJobById(*job.ID)
    err = api.JobRepository.DeleteJobByID(*job.ID)
    if err != nil {
        handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
        return
@@ -861,7 +925,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
    rw.Header().Add("Content-Type", "application/json")
    rw.WriteHeader(http.StatusOK)
    if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{
        Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
        Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
    }); err != nil {
        cclog.Errorf("Failed to encode response: %v", err)
    }
@@ -873,7 +937,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json
// @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -886,9 +950,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
    var cnt int
    // Fetch job (that will be stopped) from db
    id, ok := mux.Vars(r)["ts"]
    id := chi.URLParam(r, "ts")
    var err error
    if ok {
    if id != "" {
        ts, e := strconv.ParseInt(id, 10, 64)
        if e != nil {
            handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
@@ -896,11 +960,13 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
        }

        // Check for omit-tagged query parameter
        omitTagged := false
        omitTagged := "none"
        if omitTaggedStr := r.URL.Query().Get("omit-tagged"); omitTaggedStr != "" {
            omitTagged, e = strconv.ParseBool(omitTaggedStr)
            if e != nil {
                handleError(fmt.Errorf("boolean expected for omit-tagged parameter: %w", e), http.StatusBadRequest, rw)
            switch omitTaggedStr {
            case "none", "all", "user":
                omitTagged = omitTaggedStr
            default:
                handleError(fmt.Errorf("omit-tagged must be one of: none, all, user"), http.StatusBadRequest, rw)
                return
            }
        }

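omit-tagged is no longer a boolean: it now names which class of tags protects a job from bulk deletion (none, all, or user). A request such as DELETE /api/jobs/delete_job_before/{ts}?omit-tagged=user would therefore skip user-tagged jobs; the route path is an assumption based on the handler name, since the @router annotation is not part of this hunk.
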
@@ -924,20 +990,20 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
    }
}

func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) {
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) {
    // Sanity checks
    if job.State != schema.JobStateRunning {
        handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
        handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
        return
    }

    if job.StartTime > req.StopTime {
        handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
        handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
        return
    }

    if req.State != "" && !req.State.Valid() {
        handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
        handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
        return
    } else if req.State == "" {
        req.State = schema.JobStateCompleted
@@ -949,14 +1015,24 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
    api.JobRepository.Mutex.Lock()
    defer api.JobRepository.Mutex.Unlock()

    if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
    if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
        handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
    // If the job is still in job_cache, transfer it to the job table first
    // so that job.ID always points to the job table for downstream code
    if isCached {
        newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
        if err != nil {
            handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw)
            return
        }
        cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
        job.ID = &newID
    }

    cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
    if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
        handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
        return
    }

    cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)

    // Send a response (with status OK). This means that errors that happen from here on forward
    // can *NOT* be communicated to the client. If reading from a MetricDataRepository or

@@ -977,7 +1053,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
}

func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
    id := mux.Vars(r)["id"]
    id := chi.URLParam(r, "id")
    metrics := r.URL.Query()["metric"]
    var scopes []schema.MetricScope
    for _, scope := range r.URL.Query()["scope"] {
@@ -1022,3 +1098,57 @@ func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
        cclog.Errorf("Failed to encode response: %v", err)
    }
}

// GetUsedNodesAPIResponse model
type GetUsedNodesAPIResponse struct {
    UsedNodes map[string][]string `json:"usedNodes"` // Map of cluster names to lists of used node hostnames
}

// getUsedNodes godoc
// @summary Lists used nodes by cluster
// @tags Job query
// @description Get a map of cluster names to lists of unique hostnames that are currently in use by running jobs that started before the specified timestamp.
// @produce json
// @param ts query int true "Unix timestamp to filter jobs (jobs with start_time < ts)"
// @success 200 {object} api.GetUsedNodesAPIResponse "Map of cluster names to hostname lists"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/used_nodes [get]
func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) {
    if user := repository.GetUserFromContext(r.Context()); user != nil &&
        !user.HasRole(schema.RoleAPI) {
        handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
        return
    }

    tsStr := r.URL.Query().Get("ts")
    if tsStr == "" {
        handleError(fmt.Errorf("missing required query parameter: ts"), http.StatusBadRequest, rw)
        return
    }

    ts, err := strconv.ParseInt(tsStr, 10, 64)
    if err != nil {
        handleError(fmt.Errorf("invalid timestamp format: %w", err), http.StatusBadRequest, rw)
        return
    }

    usedNodes, err := api.JobRepository.GetUsedNodes(ts)
    if err != nil {
        handleError(fmt.Errorf("failed to get used nodes: %w", err), http.StatusInternalServerError, rw)
        return
    }

    rw.Header().Add("Content-Type", "application/json")
    payload := GetUsedNodesAPIResponse{
        UsedNodes: usedNodes,
    }

    if err := json.NewEncoder(rw).Encode(payload); err != nil {
        handleError(err, http.StatusInternalServerError, rw)
        return
    }
}

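The endpoint takes a single ts query parameter, as exercised by the GetUsedNodesNoRunning subtest above. A minimal client sketch; host and authentication are illustrative (the handler requires the API role):

package main

import (
    "encoding/json"
    "fmt"
    "net/http"
)

// Mirrors api.GetUsedNodesAPIResponse from the diff above.
type usedNodesResponse struct {
    UsedNodes map[string][]string `json:"usedNodes"`
}

func main() {
    // Authentication omitted for brevity.
    resp, err := http.Get("http://localhost:8080/api/jobs/used_nodes?ts=1700000000")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var payload usedNodesResponse
    if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
        panic(err)
    }
    for cluster, hosts := range payload.UsedNodes {
        fmt.Printf("%s: %d nodes in use\n", cluster, len(hosts))
    }
}
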
165 internal/api/log.go (new file)
@@ -0,0 +1,165 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api

import (
    "bufio"
    "encoding/json"
    "fmt"
    "net/http"
    "os/exec"
    "regexp"
    "strconv"
    "strings"

    "github.com/ClusterCockpit/cc-backend/internal/config"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
)

type LogEntry struct {
    Timestamp string `json:"timestamp"`
    Priority  int    `json:"priority"`
    Message   string `json:"message"`
    Unit      string `json:"unit"`
}

var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)

func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
    user := repository.GetUserFromContext(r.Context())
    if !user.HasRole(schema.RoleAdmin) {
        handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
        return
    }

    since := r.URL.Query().Get("since")
    if since == "" {
        since = "1 hour ago"
    }
    if !safePattern.MatchString(since) {
        handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
        return
    }

    lines := 200
    if l := r.URL.Query().Get("lines"); l != "" {
        n, err := strconv.Atoi(l)
        if err != nil || n < 1 {
            handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
            return
        }
        if n > 1000 {
            n = 1000
        }
        lines = n
    }

    unit := config.Keys.SystemdUnit
    if unit == "" {
        unit = "clustercockpit.service"
    }

    args := []string{
        "--output=json",
        "--no-pager",
        "-n", fmt.Sprintf("%d", lines),
        "--since", since,
        "-u", unit,
    }

    if level := r.URL.Query().Get("level"); level != "" {
        n, err := strconv.Atoi(level)
        if err != nil || n < 0 || n > 7 {
            handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
            return
        }
        args = append(args, "--priority", fmt.Sprintf("%d", n))
    }

    if search := r.URL.Query().Get("search"); search != "" {
        if !safePattern.MatchString(search) {
            handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
            return
        }
        args = append(args, "--grep", search)
    }

    cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
    cmd := exec.CommandContext(r.Context(), "journalctl", args...)
    stdout, err := cmd.StdoutPipe()
    if err != nil {
        handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
        return
    }

    if err := cmd.Start(); err != nil {
        handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
        return
    }

    entries := make([]LogEntry, 0, lines)
    scanner := bufio.NewScanner(stdout)
    for scanner.Scan() {
        var raw map[string]any
        if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
            cclog.Debugf("error unmarshal log output: %v", err)
            continue
        }

        priority := 6 // default info
        if p, ok := raw["PRIORITY"]; ok {
            switch v := p.(type) {
            case string:
                if n, err := strconv.Atoi(v); err == nil {
                    priority = n
                }
            case float64:
                priority = int(v)
            }
        }

        msg := ""
        if m, ok := raw["MESSAGE"]; ok {
            if s, ok := m.(string); ok {
                msg = s
            }
        }

        ts := ""
        if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
            if s, ok := t.(string); ok {
                ts = s
            }
        }

        unitName := ""
        if u, ok := raw["_SYSTEMD_UNIT"]; ok {
            if s, ok := u.(string); ok {
                unitName = s
            }
        }

        entries = append(entries, LogEntry{
            Timestamp: ts,
            Priority:  priority,
            Message:   msg,
            Unit:      unitName,
        })
    }

    if err := cmd.Wait(); err != nil {
        // journalctl returns exit code 1 when --grep matches nothing
        if len(entries) == 0 {
            cclog.Debugf("journalctl exited with: %v", err)
        }
    }

    rw.Header().Set("Content-Type", "application/json")
    if err := json.NewEncoder(rw).Encode(entries); err != nil {
        cclog.Errorf("Failed to encode log entries: %v", err)
    }
}

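For a request with lines=200, level=4, search=error, and the default since window, the handler shells out to a command equivalent to the following; every flag comes straight from the args slice built above:

journalctl --output=json --no-pager -n 200 --since "1 hour ago" -u clustercockpit.service --priority 4 --grep error
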
@@ -15,10 +15,10 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
// handleFree godoc
|
||||
@@ -58,7 +58,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
ms := metricstore.GetMemoryStore()
|
||||
n := 0
|
||||
for _, sel := range selectors {
|
||||
bn, err := ms.Free(sel, to)
|
||||
@@ -97,9 +97,9 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
 		return
 	}

-	ms := memorystore.GetMemoryStore()
+	ms := metricstore.GetMemoryStore()
 	dec := lineprotocol.NewDecoderWithBytes(bytes)
-	if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
+	if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
 		cclog.Errorf("/api/write error: %s", err.Error())
 		handleError(err, http.StatusBadRequest, rw)
 		return
@@ -129,42 +129,9 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
 		selector = strings.Split(raw, ":")
 	}

-	ms := memorystore.GetMemoryStore()
+	ms := metricstore.GetMemoryStore()
 	if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
 		handleError(err, http.StatusBadRequest, rw)
 		return
 	}
 }
-
-// handleHealthCheck godoc
-// @summary HealthCheck endpoint
-// @tags healthcheck
-// @description This endpoint allows the users to check if a node is healthy
-// @produce json
-// @param selector query string false "Selector"
-// @success 200 {string} string "Debug dump"
-// @failure 400 {object} api.ErrorResponse "Bad Request"
-// @failure 401 {object} api.ErrorResponse "Unauthorized"
-// @failure 403 {object} api.ErrorResponse "Forbidden"
-// @failure 500 {object} api.ErrorResponse "Internal Server Error"
-// @security ApiKeyAuth
-// @router /healthcheck/ [get]
-func metricsHealth(rw http.ResponseWriter, r *http.Request) {
-	rawCluster := r.URL.Query().Get("cluster")
-	rawNode := r.URL.Query().Get("node")
-
-	if rawCluster == "" || rawNode == "" {
-		handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
-		return
-	}
-
-	rw.Header().Add("Content-Type", "application/json")
-
-	selector := []string{rawCluster, rawNode}
-
-	ms := memorystore.GetMemoryStore()
-	if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
-		handleError(err, http.StatusBadRequest, rw)
-		return
-	}
-}
400
internal/api/nats.go
Normal file
@@ -0,0 +1,400 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package api

import (
	"database/sql"
	"encoding/json"
	"strings"
	"sync"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/archiver"
	"github.com/ClusterCockpit/cc-backend/internal/config"
	"github.com/ClusterCockpit/cc-backend/internal/importer"
	"github.com/ClusterCockpit/cc-backend/internal/repository"
	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
	"github.com/ClusterCockpit/cc-lib/v2/nats"
	"github.com/ClusterCockpit/cc-lib/v2/receivers"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
	influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
)

// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
//	measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
//	job,function=start_job event="{...JSON payload...}" <timestamp>
//	job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
//	job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
//	nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
//	nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct {
	JobRepository *repository.JobRepository
	// RepositoryMutex protects job creation operations from race conditions
	// when checking for duplicate jobs during startJob calls.
	RepositoryMutex sync.Mutex
}

// NewNatsAPI creates a new NatsAPI instance with default dependencies.
func NewNatsAPI() *NatsAPI {
	return &NatsAPI{
		JobRepository: repository.GetJobRepository(),
	}
}

// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
// Returns an error if the NATS client is not available or subscription fails.
func (api *NatsAPI) StartSubscriptions() error {
	client := nats.GetClient()
	if client == nil {
		cclog.Warn("NATS client not available, skipping API subscriptions")
		return nil
	}

	if config.Keys.APISubjects != nil {
		s := config.Keys.APISubjects

		if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
			return err
		}

		if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
			return err
		}

		cclog.Info("NATS API subscriptions started")
	}
	return nil
}
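For producers, a hedged sketch of constructing a start_job message in the format documented above. Only the payload format is given by this patch: the subject names come from config.Keys.APISubjects, the actual NATS publish call is omitted because the cc-lib client API is not shown here, and Go's %q verb only approximates line-protocol string escaping for plain-ASCII JSON like this:

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

func main() {
	// JSON payload carried in the "event" field; field names follow the
	// schema.Job structure, and only a subset is shown.
	payload, _ := json.Marshal(map[string]any{
		"jobId":     1001,
		"user":      "testuser",
		"cluster":   "testcluster",
		"startTime": 1234567890,
	})

	// %q quotes the string and escapes the inner quotes, which approximates
	// line-protocol string-field escaping for plain-ASCII JSON.
	line := fmt.Sprintf("job,function=start_job event=%q %d",
		string(payload), time.Now().UnixNano())

	fmt.Println(line)
	// job,function=start_job event="{\"cluster\":\"testcluster\",...}" 1700000000000000000
}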

// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
	function, ok := msg.GetTag("function")
	if !ok {
		cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
		return
	}

	switch function {
	case "start_job":
		v, ok := msg.GetEventValue()
		if !ok {
			cclog.Errorf("Job start event is missing event field with JSON payload")
			return
		}
		api.handleStartJob(v)

	case "stop_job":
		v, ok := msg.GetEventValue()
		if !ok {
			cclog.Errorf("Job stop event is missing event field with JSON payload")
			return
		}
		api.handleStopJob(v)

	default:
		cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
	}
}

// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
//   - tag "function" with value "start_job" or "stop_job"
//   - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
	if len(data) == 0 {
		cclog.Warnf("NATS %s: received empty message", subject)
		return
	}

	d := influx.NewDecoderWithBytes(data)

	for d.Next() {
		m, err := receivers.DecodeInfluxMessage(d)
		if err != nil {
			cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
			return
		}

		if !m.IsEvent() {
			cclog.Debugf("NATS %s: received non-event message, skipping", subject)
			continue
		}

		if m.Name() == "job" {
			api.processJobEvent(m)
		} else {
			cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
		}
	}
}

// handleStartJob processes job start messages received via NATS.
// The payload parameter contains JSON following the schema.Job structure.
// Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
	if payload == "" {
		cclog.Error("NATS start job: payload is empty")
		return
	}
	req := schema.Job{
		Shared:           "none",
		MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
	}

	dec := json.NewDecoder(strings.NewReader(payload))
	dec.DisallowUnknownFields()
	if err := dec.Decode(&req); err != nil {
		cclog.Errorf("NATS start job: parsing request failed: %v", err)
		return
	}

	cclog.Debugf("NATS start job: %s", req.GoString())
	req.State = schema.JobStateRunning

	if err := importer.SanityChecks(&req); err != nil {
		cclog.Errorf("NATS start job: sanity check failed: %v", err)
		return
	}

	var unlockOnce sync.Once
	api.RepositoryMutex.Lock()
	defer unlockOnce.Do(api.RepositoryMutex.Unlock)

	jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
	if err != nil && err != sql.ErrNoRows {
		cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
		return
	}
	if err == nil {
		for _, job := range jobs {
			if (req.StartTime - job.StartTime) < secondsPerDay {
				cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
					req.JobID, req.Cluster, job.ID)
				return
			}
		}
	}

	// When tags are present, insert directly into the job table so that the
	// returned ID can be used with AddTagOrCreate (which queries the job table).
	var id int64
	if len(req.Tags) > 0 {
		id, err = api.JobRepository.StartDirect(&req)
	} else {
		id, err = api.JobRepository.Start(&req)
	}
	if err != nil {
		cclog.Errorf("NATS start job: insert into database failed: %v", err)
		return
	}
	unlockOnce.Do(api.RepositoryMutex.Unlock)

	for _, tag := range req.Tags {
		if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
			cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
			return
		}
	}

	cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
		id, req.Cluster, req.JobID, req.User, req.StartTime)
}
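The unlockOnce shape above deserves a note: RepositoryMutex is released exactly once, either early after the insert succeeds (so the tag updates run without holding the lock) or via the deferred Do on any error return. The same pattern in isolation, with hypothetical stand-ins for the duplicate check/insert and the tag updates:

package main

import "sync"

// Hypothetical stand-ins for the work done inside and outside the lock.
func duplicateCheckAndInsert() error { return nil }
func addTags() error                 { return nil }

// criticalThenSlow mirrors the locking shape of handleStartJob: the mutex is
// released exactly once, either explicitly once the insert has succeeded or
// by the deferred Do on any early error return.
func criticalThenSlow(mu *sync.Mutex) error {
	var unlockOnce sync.Once
	mu.Lock()
	defer unlockOnce.Do(mu.Unlock)

	if err := duplicateCheckAndInsert(); err != nil { // needs the lock
		return err // the deferred unlockOnce.Do releases the lock here
	}

	unlockOnce.Do(mu.Unlock) // release early; tag updates do not need the lock
	return addTags()
}

func main() {
	var mu sync.Mutex
	_ = criticalThenSlow(&mu)
}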

// handleStopJob processes job stop messages received via NATS.
// The payload parameter contains JSON following the StopJobAPIRequest structure.
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
	if payload == "" {
		cclog.Error("NATS stop job: payload is empty")
		return
	}
	var req StopJobAPIRequest

	dec := json.NewDecoder(strings.NewReader(payload))
	dec.DisallowUnknownFields()
	if err := dec.Decode(&req); err != nil {
		cclog.Errorf("NATS job stop: parsing request failed: %v", err)
		return
	}

	if req.JobID == nil {
		cclog.Errorf("NATS job stop: the field 'jobId' is required")
		return
	}

	isCached := false
	job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
	if err != nil {
		// Not in cache, try main job table
		job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
		if err != nil {
			cclog.Errorf("NATS job stop: finding job failed: %v", err)
			return
		}
	} else {
		isCached = true
	}

	if job.State != schema.JobStateRunning {
		cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
			job.JobID, job.ID, job.Cluster, job.State)
		return
	}

	if job.StartTime > req.StopTime {
		cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
			job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
		return
	}

	if req.State != "" && !req.State.Valid() {
		cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
			job.JobID, job.ID, job.Cluster, req.State)
		return
	} else if req.State == "" {
		req.State = schema.JobStateCompleted
	}

	job.Duration = int32(req.StopTime - job.StartTime)
	job.State = req.State
	api.JobRepository.Mutex.Lock()
	defer api.JobRepository.Mutex.Unlock()

	// If the job is still in job_cache, transfer it to the job table first
	if isCached {
		newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
		if err != nil {
			cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
				job.JobID, *job.ID, job.Cluster, err)
			return
		}
		cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
		job.ID = &newID
	}

	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
		cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
			job.JobID, *job.ID, job.Cluster, job.State, err)
		return
	}

	cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
		*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)

	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
		return
	}

	archiver.TriggerArchiving(job)
}

// processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
	v, ok := msg.GetEventValue()
	if !ok {
		cclog.Errorf("Nodestate event is missing event field with JSON payload")
		return
	}

	var req UpdateNodeStatesRequest

	dec := json.NewDecoder(strings.NewReader(v))
	dec.DisallowUnknownFields()
	if err := dec.Decode(&req); err != nil {
		cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
		return
	}

	repo := repository.GetNodeRepository()
	requestReceived := time.Now().Unix()

	for _, node := range req.Nodes {
		state := determineState(node.States)
		nodeState := schema.NodeStateDB{
			TimeStamp:       requestReceived,
			NodeState:       state,
			CpusAllocated:   node.CpusAllocated,
			MemoryAllocated: node.MemoryAllocated,
			GpusAllocated:   node.GpusAllocated,
			HealthState:     schema.MonitoringStateFull,
			JobsRunning:     node.JobsRunning,
		}

		if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
			cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
				node.Hostname, req.Cluster, err)
		}
	}

	cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}

// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
//   - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
	if len(data) == 0 {
		cclog.Warnf("NATS %s: received empty message", subject)
		return
	}

	d := influx.NewDecoderWithBytes(data)

	for d.Next() {
		m, err := receivers.DecodeInfluxMessage(d)
		if err != nil {
			cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
			return
		}

		if !m.IsEvent() {
			cclog.Warnf("NATS %s: received non-event message, skipping", subject)
			continue
		}

		if m.Name() == "nodestate" {
			api.processNodestateEvent(m)
		} else {
			cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
		}
	}
}
947
internal/api/nats_test.go
Normal file
@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/archiver"
	"github.com/ClusterCockpit/cc-backend/internal/auth"
	"github.com/ClusterCockpit/cc-backend/internal/config"
	"github.com/ClusterCockpit/cc-backend/internal/graph"
	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-backend/pkg/archive"
	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
	ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
	"github.com/ClusterCockpit/cc-lib/v2/schema"

	_ "github.com/mattn/go-sqlite3"
)

func setupNatsTest(t *testing.T) *NatsAPI {
	repository.ResetConnection()

	const testconfig = `{
		"main": {
			"addr": "0.0.0.0:8080",
			"validate": false,
			"api-allowed-ips": [
				"*"
			]
		},
		"archive": {
			"kind": "file",
			"path": "./var/job-archive"
		},
		"auth": {
			"jwts": {
				"max-age": "2m"
			}
		}
	}`
	const testclusterJSON = `{
		"name": "testcluster",
		"subClusters": [
			{
				"name": "sc1",
				"nodes": "host123,host124,host125",
				"processorType": "Intel Core i7-4770",
				"socketsPerNode": 1,
				"coresPerSocket": 4,
				"threadsPerCore": 2,
				"flopRateScalar": {
					"unit": {
						"prefix": "G",
						"base": "F/s"
					},
					"value": 14
				},
				"flopRateSimd": {
					"unit": {
						"prefix": "G",
						"base": "F/s"
					},
					"value": 112
				},
				"memoryBandwidth": {
					"unit": {
						"prefix": "G",
						"base": "B/s"
					},
					"value": 24
				},
				"numberOfNodes": 70,
				"topology": {
					"node": [0, 1, 2, 3, 4, 5, 6, 7],
					"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
					"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
					"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
					"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
				}
			}
		],
		"metricConfig": [
			{
				"name": "load_one",
				"unit": { "base": "" },
				"scope": "node",
				"timestep": 60,
				"aggregation": "avg",
				"peak": 8,
				"normal": 0,
				"caution": 0,
				"alert": 0
			}
		]
	}`

	cclog.Init("info", true)
	tmpdir := t.TempDir()
	jobarchive := filepath.Join(tmpdir, "job-archive")
	if err := os.Mkdir(jobarchive, 0o777); err != nil {
		t.Fatal(err)
	}

	if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
		t.Fatal(err)
	}

	if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
		t.Fatal(err)
	}

	if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
		t.Fatal(err)
	}

	dbfilepath := filepath.Join(tmpdir, "test.db")
	err := repository.MigrateDB(dbfilepath)
	if err != nil {
		t.Fatal(err)
	}

	cfgFilePath := filepath.Join(tmpdir, "config.json")
	if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
		t.Fatal(err)
	}

	ccconf.Init(cfgFilePath)

	// Load and check main configuration
	if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
		config.Init(cfg)
	} else {
		cclog.Abort("Main configuration must be present")
	}
	archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)

	repository.Connect(dbfilepath)

	if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
		t.Fatal(err)
	}

	// metricstore initialization removed - it is initialized via callback in tests

	archiver.Start(repository.GetJobRepository(), context.Background())

	if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
		auth.Init(&cfg)
	} else {
		cclog.Warn("Authentication disabled due to missing configuration")
		auth.Init(nil)
	}

	graph.Init()

	return NewNatsAPI()
}

func cleanupNatsTest() {
	if err := archiver.Shutdown(5 * time.Second); err != nil {
		cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
	}
}

func TestNatsHandleStartJob(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	tests := []struct {
		name          string
		payload       string
		expectError   bool
		validateJob   func(t *testing.T, job *schema.Job)
		shouldFindJob bool
	}{
		{
			name: "valid job start",
			payload: `{
				"jobId": 1001,
				"user": "testuser1",
				"project": "testproj1",
				"cluster": "testcluster",
				"partition": "main",
				"walltime": 7200,
				"numNodes": 1,
				"numHwthreads": 8,
				"numAcc": 0,
				"shared": "none",
				"monitoringStatus": 1,
				"smt": 1,
				"resources": [
					{
						"hostname": "host123",
						"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
					}
				],
				"startTime": 1234567890
			}`,
			expectError:   false,
			shouldFindJob: true,
			validateJob: func(t *testing.T, job *schema.Job) {
				if job.JobID != 1001 {
					t.Errorf("expected JobID 1001, got %d", job.JobID)
				}
				if job.User != "testuser1" {
					t.Errorf("expected user testuser1, got %s", job.User)
				}
				if job.State != schema.JobStateRunning {
					t.Errorf("expected state running, got %s", job.State)
				}
			},
		},
		{
			name: "invalid JSON",
			payload: `{
				"jobId": "not a number",
				"user": "testuser2"
			}`,
			expectError:   true,
			shouldFindJob: false,
		},
		{
			name: "missing required fields",
			payload: `{
				"jobId": 1002
			}`,
			expectError:   true,
			shouldFindJob: false,
		},
		{
			name: "job with unknown fields (should fail due to DisallowUnknownFields)",
			payload: `{
				"jobId": 1003,
				"user": "testuser3",
				"project": "testproj3",
				"cluster": "testcluster",
				"partition": "main",
				"walltime": 3600,
				"numNodes": 1,
				"numHwthreads": 8,
				"unknownField": "should cause error",
				"startTime": 1234567900
			}`,
			expectError:   true,
			shouldFindJob: false,
		},
		{
			name: "job with tags",
			payload: `{
				"jobId": 1004,
				"user": "testuser4",
				"project": "testproj4",
				"cluster": "testcluster",
				"partition": "main",
				"walltime": 3600,
				"numNodes": 1,
				"numHwthreads": 8,
				"numAcc": 0,
				"shared": "none",
				"monitoringStatus": 1,
				"smt": 1,
				"resources": [
					{
						"hostname": "host123",
						"hwthreads": [0, 1, 2, 3]
					}
				],
				"tags": [
					{
						"type": "test",
						"name": "testtag",
						"scope": "testuser4"
					}
				],
				"startTime": 1234567910
			}`,
			expectError:   false,
			shouldFindJob: true,
			validateJob: func(t *testing.T, job *schema.Job) {
				if job.JobID != 1004 {
					t.Errorf("expected JobID 1004, got %d", job.JobID)
				}
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			natsAPI.handleStartJob(tt.payload)
			natsAPI.JobRepository.SyncJobs()

			// Allow some time for async operations
			time.Sleep(100 * time.Millisecond)

			if tt.shouldFindJob {
				// Extract jobId from payload
				var payloadMap map[string]any
				json.Unmarshal([]byte(tt.payload), &payloadMap)
				jobID := int64(payloadMap["jobId"].(float64))
				cluster := payloadMap["cluster"].(string)
				startTime := int64(payloadMap["startTime"].(float64))

				job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
				if err != nil {
					if !tt.expectError {
						t.Fatalf("expected to find job, but got error: %v", err)
					}
					return
				}

				if tt.validateJob != nil {
					tt.validateJob(t, job)
				}
			}
		})
	}
}

func TestNatsHandleStopJob(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	// First, create a running job
	startPayload := `{
		"jobId": 2001,
		"user": "testuser",
		"project": "testproj",
		"cluster": "testcluster",
		"partition": "main",
		"walltime": 3600,
		"numNodes": 1,
		"numHwthreads": 8,
		"numAcc": 0,
		"shared": "none",
		"monitoringStatus": 1,
		"smt": 1,
		"resources": [
			{
				"hostname": "host123",
				"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
			}
		],
		"startTime": 1234567890
	}`

	natsAPI.handleStartJob(startPayload)
	natsAPI.JobRepository.SyncJobs()
	time.Sleep(100 * time.Millisecond)

	tests := []struct {
		name         string
		payload      string
		expectError  bool
		validateJob  func(t *testing.T, job *schema.Job)
		setupJobFunc func() // Optional: create specific test job
	}{
		{
			name: "valid job stop - completed",
			payload: `{
				"jobId": 2001,
				"cluster": "testcluster",
				"startTime": 1234567890,
				"jobState": "completed",
				"stopTime": 1234571490
			}`,
			expectError: false,
			validateJob: func(t *testing.T, job *schema.Job) {
				if job.State != schema.JobStateCompleted {
					t.Errorf("expected state completed, got %s", job.State)
				}
				expectedDuration := int32(1234571490 - 1234567890)
				if job.Duration != expectedDuration {
					t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
				}
			},
		},
		{
			name: "valid job stop - failed",
			setupJobFunc: func() {
				startPayloadFailed := `{
					"jobId": 2002,
					"user": "testuser",
					"project": "testproj",
					"cluster": "testcluster",
					"partition": "main",
					"walltime": 3600,
					"numNodes": 1,
					"numHwthreads": 8,
					"numAcc": 0,
					"shared": "none",
					"monitoringStatus": 1,
					"smt": 1,
					"resources": [
						{
							"hostname": "host123",
							"hwthreads": [0, 1, 2, 3]
						}
					],
					"startTime": 1234567900
				}`
				natsAPI.handleStartJob(startPayloadFailed)
				natsAPI.JobRepository.SyncJobs()
				time.Sleep(100 * time.Millisecond)
			},
			payload: `{
				"jobId": 2002,
				"cluster": "testcluster",
				"startTime": 1234567900,
				"jobState": "failed",
				"stopTime": 1234569900
			}`,
			expectError: false,
			validateJob: func(t *testing.T, job *schema.Job) {
				if job.State != schema.JobStateFailed {
					t.Errorf("expected state failed, got %s", job.State)
				}
			},
		},
		{
			name: "invalid JSON",
			payload: `{
				"jobId": "not a number"
			}`,
			expectError: true,
		},
		{
			name: "missing jobId",
			payload: `{
				"cluster": "testcluster",
				"jobState": "completed",
				"stopTime": 1234571490
			}`,
			expectError: true,
		},
		{
			name: "invalid job state",
			setupJobFunc: func() {
				startPayloadInvalid := `{
					"jobId": 2003,
					"user": "testuser",
					"project": "testproj",
					"cluster": "testcluster",
					"partition": "main",
					"walltime": 3600,
					"numNodes": 1,
					"numHwthreads": 8,
					"numAcc": 0,
					"shared": "none",
					"monitoringStatus": 1,
					"smt": 1,
					"resources": [
						{
							"hostname": "host123",
							"hwthreads": [0, 1]
						}
					],
					"startTime": 1234567910
				}`
				natsAPI.handleStartJob(startPayloadInvalid)
				natsAPI.JobRepository.SyncJobs()
				time.Sleep(100 * time.Millisecond)
			},
			payload: `{
				"jobId": 2003,
				"cluster": "testcluster",
				"startTime": 1234567910,
				"jobState": "invalid_state",
				"stopTime": 1234571510
			}`,
			expectError: true,
		},
		{
			name: "stopTime before startTime",
			setupJobFunc: func() {
				startPayloadTime := `{
					"jobId": 2004,
					"user": "testuser",
					"project": "testproj",
					"cluster": "testcluster",
					"partition": "main",
					"walltime": 3600,
					"numNodes": 1,
					"numHwthreads": 8,
					"numAcc": 0,
					"shared": "none",
					"monitoringStatus": 1,
					"smt": 1,
					"resources": [
						{
							"hostname": "host123",
							"hwthreads": [0]
						}
					],
					"startTime": 1234567920
				}`
				natsAPI.handleStartJob(startPayloadTime)
				natsAPI.JobRepository.SyncJobs()
				time.Sleep(100 * time.Millisecond)
			},
			payload: `{
				"jobId": 2004,
				"cluster": "testcluster",
				"startTime": 1234567920,
				"jobState": "completed",
				"stopTime": 1234567900
			}`,
			expectError: true,
		},
		{
			name: "job not found",
			payload: `{
				"jobId": 99999,
				"cluster": "testcluster",
				"startTime": 1234567890,
				"jobState": "completed",
				"stopTime": 1234571490
			}`,
			expectError: true,
		},
	}

	testData := schema.JobData{
		"load_one": map[schema.MetricScope]*schema.JobMetric{
			schema.MetricScopeNode: {
				Unit:     schema.Unit{Base: "load"},
				Timestep: 60,
				Series: []schema.Series{
					{
						Hostname:   "host123",
						Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
						Data:       []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
					},
				},
			},
		},
	}

	metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
		return testData, nil
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if tt.setupJobFunc != nil {
				tt.setupJobFunc()
			}

			natsAPI.handleStopJob(tt.payload)

			// Allow some time for async operations
			time.Sleep(100 * time.Millisecond)

			if !tt.expectError && tt.validateJob != nil {
				// Extract job details from payload
				var payloadMap map[string]any
				json.Unmarshal([]byte(tt.payload), &payloadMap)
				jobID := int64(payloadMap["jobId"].(float64))
				cluster := payloadMap["cluster"].(string)

				var startTime *int64
				if st, ok := payloadMap["startTime"]; ok {
					ts := int64(st.(float64))
					startTime = &ts
				}

				job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
				if err != nil {
					t.Fatalf("expected to find job, but got error: %v", err)
				}

				tt.validateJob(t, job)
			}
		})
	}
}

func TestNatsHandleNodeState(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	tests := []struct {
		name        string
		data        []byte
		expectError bool
		validateFn  func(t *testing.T)
	}{
		{
			name:        "valid node state update",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
			expectError: false,
			validateFn: func(t *testing.T) {
				// In a full test, we would verify the node state was updated in the database
				// For now, just ensure no error occurred
			},
		},
		{
			name:        "multiple nodes",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
			expectError: false,
		},
		{
			name:        "invalid JSON in event field",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
			expectError: true,
		},
		{
			name:        "empty nodes array",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
			expectError: false, // Empty array should not cause error
		},
		{
			name:        "invalid line protocol format",
			data:        []byte(`invalid line protocol format`),
			expectError: true,
		},
		{
			name:        "empty data",
			data:        []byte(``),
			expectError: false, // Should be handled gracefully with warning
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			natsAPI.handleNodeState("test.subject", tt.data)

			// Allow some time for async operations
			time.Sleep(50 * time.Millisecond)

			if tt.validateFn != nil {
				tt.validateFn(t)
			}
		})
	}
}

func TestNatsProcessJobEvent(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	msgStartJob, err := lp.NewMessage(
		"job",
		map[string]string{"function": "start_job"},
		nil,
		map[string]any{
			"event": `{
				"jobId": 3001,
				"user": "testuser",
				"project": "testproj",
				"cluster": "testcluster",
				"partition": "main",
				"walltime": 3600,
				"numNodes": 1,
				"numHwthreads": 8,
				"numAcc": 0,
				"shared": "none",
				"monitoringStatus": 1,
				"smt": 1,
				"resources": [
					{
						"hostname": "host123",
						"hwthreads": [0, 1, 2, 3]
					}
				],
				"startTime": 1234567890
			}`,
		},
		time.Now(),
	)
	if err != nil {
		t.Fatalf("failed to create test message: %v", err)
	}

	msgMissingTag, err := lp.NewMessage(
		"job",
		map[string]string{},
		nil,
		map[string]any{
			"event": `{}`,
		},
		time.Now(),
	)
	if err != nil {
		t.Fatalf("failed to create test message: %v", err)
	}

	msgUnknownFunc, err := lp.NewMessage(
		"job",
		map[string]string{"function": "unknown_function"},
		nil,
		map[string]any{
			"event": `{}`,
		},
		time.Now(),
	)
	if err != nil {
		t.Fatalf("failed to create test message: %v", err)
	}

	tests := []struct {
		name        string
		message     lp.CCMessage
		expectError bool
	}{
		{
			name:        "start_job function",
			message:     msgStartJob,
			expectError: false,
		},
		{
			name:        "missing function tag",
			message:     msgMissingTag,
			expectError: true,
		},
		{
			name:        "unknown function",
			message:     msgUnknownFunc,
			expectError: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			natsAPI.processJobEvent(tt.message)
			time.Sleep(50 * time.Millisecond)
		})
	}
}

func TestNatsHandleJobEvent(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	tests := []struct {
		name        string
		data        []byte
		expectError bool
	}{
		{
			name:        "valid influx line protocol",
			data:        []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
			expectError: false,
		},
		{
			name:        "invalid influx line protocol",
			data:        []byte(`invalid line protocol format`),
			expectError: true,
		},
		{
			name:        "empty data",
			data:        []byte(``),
			expectError: false, // Decoder should handle empty input gracefully
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// handleJobEvent doesn't return errors, it logs them.
			// We're just ensuring it doesn't panic.
			natsAPI.handleJobEvent("test.subject", tt.data)
			time.Sleep(50 * time.Millisecond)
		})
	}
}

func TestNatsHandleJobEventEdgeCases(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	tests := []struct {
		name        string
		data        []byte
		expectError bool
		description string
	}{
		{
			name:        "non-event message (metric data)",
			data:        []byte(`job,function=start_job value=123.45 1234567890000000000`),
			expectError: false,
			description: "Should skip non-event messages gracefully",
		},
		{
			name:        "wrong measurement name",
			data:        []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
			expectError: false,
			description: "Should warn about unexpected measurement but not fail",
		},
		{
			name:        "missing event field",
			data:        []byte(`job,function=start_job other_field="value" 1234567890000000000`),
			expectError: true,
			description: "Should error when event field is missing",
		},
		{
			name:        "multiple measurements in one message",
			data:        []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
			expectError: false,
			description: "Should process multiple lines",
		},
		{
			name:        "escaped quotes in JSON payload",
			data:        []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
			expectError: true,
			description: "Should handle escaped quotes (though JSON parsing may fail)",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			natsAPI.handleJobEvent("test.subject", tt.data)
			time.Sleep(50 * time.Millisecond)
		})
	}
}

func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	tests := []struct {
		name        string
		data        []byte
		expectError bool
		description string
	}{
		{
			name:        "missing cluster field in JSON",
			data:        []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
			expectError: true,
			description: "Should fail when cluster is missing",
		},
		{
			name:        "malformed JSON with unescaped quotes",
			data:        []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
			expectError: true,
			description: "Should fail on malformed JSON",
		},
		{
			name:        "unicode characters in hostname",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
			expectError: false,
			description: "Should handle unicode characters",
		},
		{
			name:        "very large node count",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
			expectError: false,
			description: "Should handle multiple nodes efficiently",
		},
		{
			name:        "timestamp in past",
			data:        []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
			expectError: false,
			description: "Should accept any valid timestamp",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			natsAPI.handleNodeState("test.subject", tt.data)
			time.Sleep(50 * time.Millisecond)
		})
	}
}

func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
	natsAPI := setupNatsTest(t)
	t.Cleanup(cleanupNatsTest)

	// Start a job
	payload := `{
		"jobId": 5001,
		"user": "testuser",
		"project": "testproj",
		"cluster": "testcluster",
		"partition": "main",
		"walltime": 3600,
		"numNodes": 1,
		"numHwthreads": 8,
		"numAcc": 0,
		"shared": "none",
		"monitoringStatus": 1,
		"smt": 1,
		"resources": [
			{
				"hostname": "host123",
				"hwthreads": [0, 1, 2, 3]
			}
		],
		"startTime": 1234567890
	}`

	natsAPI.handleStartJob(payload)
	natsAPI.JobRepository.SyncJobs()
	time.Sleep(100 * time.Millisecond)

	// Try to start the same job again (within 24 hours)
	duplicatePayload := `{
		"jobId": 5001,
		"user": "testuser",
		"project": "testproj",
		"cluster": "testcluster",
		"partition": "main",
		"walltime": 3600,
		"numNodes": 1,
		"numHwthreads": 8,
		"numAcc": 0,
		"shared": "none",
		"monitoringStatus": 1,
		"smt": 1,
		"resources": [
			{
				"hostname": "host123",
				"hwthreads": [0, 1, 2, 3]
			}
		],
		"startTime": 1234567900
	}`

	natsAPI.handleStartJob(duplicatePayload)
	natsAPI.JobRepository.SyncJobs()
	time.Sleep(100 * time.Millisecond)

	// Verify only one job exists
	jobID := int64(5001)
	cluster := "testcluster"
	jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
	if err != nil && err != sql.ErrNoRows {
		t.Fatalf("unexpected error: %v", err)
	}

	if len(jobs) != 1 {
		t.Errorf("expected 1 job, got %d", len(jobs))
	}
}
@@ -7,12 +7,17 @@ package api

 import (
 	"fmt"
+	"maps"
 	"net/http"
 	"strings"
 	"time"

+	"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	"github.com/ClusterCockpit/cc-backend/pkg/archive"
+	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )

 type UpdateNodeStatesRequest struct {
@@ -20,6 +25,15 @@ type UpdateNodeStatesRequest struct {
 	Cluster string `json:"cluster" example:"fritz"`
 }

+// metricListToNames converts a map of metric configurations to a list of metric names
+func metricListToNames(metricList map[string]*schema.Metric) []string {
+	names := make([]string, 0, len(metricList))
+	for name := range metricList {
+		names = append(names, name)
+	}
+	return names
+}
+
 // this routine assumes that only one of them exists per node
 func determineState(states []string) schema.SchedulerState {
 	for _, state := range states {
@@ -47,7 +61,7 @@ func determineState(states []string) schema.SchedulerState {
 // @description Required query-parameter defines if all users or only users with additional special roles are returned.
 // @produce json
 // @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
-// @success 200 {object} api.DefaultApiResponse "Success message"
+// @success 200 {object} api.DefaultAPIResponse "Success message"
 // @failure 400 {object} api.ErrorResponse "Bad Request"
 // @failure 401 {object} api.ErrorResponse "Unauthorized"
 // @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -62,19 +76,70 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 			http.StatusBadRequest, rw)
 		return
 	}
+	requestReceived := time.Now().Unix()
 	repo := repository.GetNodeRepository()

+	m := make(map[string][]string)
+	metricNames := make(map[string][]string)
+	healthResults := make(map[string]metricstore.HealthCheckResult)
+
+	startMs := time.Now()
+
+	// Step 1: Build nodeList and metricList per subcluster
+	for _, node := range req.Nodes {
+		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
+			m[sc] = append(m[sc], node.Hostname)
+		}
+	}
+
+	for sc := range m {
+		if sc != "" {
+			metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
+			metricNames[sc] = metricListToNames(metricList)
+		}
+	}
+
+	// Step 2: Determine which metric store to query and perform health check
+	healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
+	if err != nil {
+		cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
+	} else {
+		for sc, nl := range m {
+			if sc != "" {
+				if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
+					maps.Copy(healthResults, results)
+				}
+			}
+		}
+	}
+
+	cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
+	startDB := time.Now()
+
 	for _, node := range req.Nodes {
 		state := determineState(node.States)
+		healthState := schema.MonitoringStateFailed
+		var healthMetrics string
+		if result, ok := healthResults[node.Hostname]; ok {
+			healthState = result.State
+			healthMetrics = result.HealthMetrics
+		}
 		nodeState := schema.NodeStateDB{
-			TimeStamp: time.Now().Unix(), NodeState: state,
+			TimeStamp:       requestReceived,
+			NodeState:       state,
 			CpusAllocated:   node.CpusAllocated,
 			MemoryAllocated: node.MemoryAllocated,
 			GpusAllocated:   node.GpusAllocated,
-			HealthState:     schema.MonitoringStateFull,
+			HealthState:     healthState,
+			HealthMetrics:   healthMetrics,
 			JobsRunning:     node.JobsRunning,
 		}

-		repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
+		if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
+			cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
+				node.Hostname, req.Cluster, err)
+		}
 	}
+
+	cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
 }

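The two added steps follow a group-then-merge shape: bucket hostnames by subcluster, then merge the per-subcluster health results into one map, as maps.Copy does above. A minimal, self-contained sketch with hypothetical stand-ins for the archive and metricdispatch calls:

package main

import (
	"fmt"
	"maps"
)

// Hypothetical stand-in for archive.GetSubClusterByNode.
func getSubClusterByNode(host string) (string, error) { return "sc1", nil }

// Hypothetical stand-in for the per-subcluster health check.
func healthCheck(hosts []string) (map[string]string, error) {
	res := make(map[string]string, len(hosts))
	for _, h := range hosts {
		res[h] = "full"
	}
	return res, nil
}

func main() {
	hosts := []string{"host123", "host124", "host125"}

	// Step 1: bucket hostnames by subcluster, since each subcluster
	// has its own metric list.
	bySubCluster := map[string][]string{}
	for _, h := range hosts {
		if sc, err := getSubClusterByNode(h); err == nil {
			bySubCluster[sc] = append(bySubCluster[sc], h)
		}
	}

	// Step 2: run one health check per subcluster and merge the
	// per-host results into a single map.
	results := map[string]string{}
	for _, nl := range bySubCluster {
		if r, err := healthCheck(nl); err == nil {
			maps.Copy(results, r)
		}
	}
	fmt.Println(results) // map[host123:full host124:full host125:full]
}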
@@ -22,10 +22,11 @@ import (
 	"github.com/ClusterCockpit/cc-backend/internal/auth"
 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
-	"github.com/ClusterCockpit/cc-lib/util"
-	"github.com/gorilla/mux"
+	"github.com/ClusterCockpit/cc-backend/internal/tagger"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
+	"github.com/ClusterCockpit/cc-lib/v2/util"
+	"github.com/go-chi/chi/v5"
 )

 // @title ClusterCockpit REST API
@@ -48,6 +49,7 @@ import (
 const (
 	noticeFilePath  = "./var/notice.txt"
 	noticeFilePerms = 0o644
+	maxNoticeLength = 10000 // Maximum allowed notice content length in characters
 )

 type RestAPI struct {
@@ -61,6 +63,7 @@ type RestAPI struct {
 	RepositoryMutex sync.Mutex
 }

+// New creates and initializes a new RestAPI instance with configured dependencies.
 func New() *RestAPI {
 	return &RestAPI{
 		JobRepository: repository.GetJobRepository(),
@@ -69,79 +72,100 @@ func New() *RestAPI {
 	}
 }

-func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
-	r.StrictSlash(true)
+// MountAPIRoutes registers REST API endpoints for job and cluster management.
+// These routes use JWT token authentication via the X-Auth-Token header.
+func (api *RestAPI) MountAPIRoutes(r chi.Router) {
 	// REST API Uses TokenAuth
 	// User List
-	r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
+	r.Get("/users/", api.getUsers)
 	// Cluster List
-	r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
+	r.Get("/clusters/", api.getClusters)
 	// Slurm node state
-	r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
+	r.Post("/nodestate/", api.updateNodeStates)
+	r.Put("/nodestate/", api.updateNodeStates)
 	// Job Handler
-	r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
-	r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
-	r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
-	r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
-	r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
-	r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
-	r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete)
-	r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch)
-	r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
-	r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete)
-	r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete)
-	r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete)
+	if config.Keys.APISubjects == nil {
+		cclog.Info("Enabling REST start/stop job API")
+		r.Post("/jobs/start_job/", api.startJob)
+		r.Put("/jobs/start_job/", api.startJob)
+		r.Post("/jobs/stop_job/", api.stopJobByRequest)
+		r.Put("/jobs/stop_job/", api.stopJobByRequest)
+	}
+	r.Get("/jobs/", api.getJobs)
+	r.Get("/jobs/used_nodes", api.getUsedNodes)
+	r.Post("/jobs/tag_job/{id}", api.tagJob)
+	r.Patch("/jobs/tag_job/{id}", api.tagJob)
+	r.Delete("/jobs/tag_job/{id}", api.removeTagJob)
+	r.Patch("/jobs/edit_meta/{id}", api.editMeta)
+	r.Patch("/jobs/edit_meta/", api.editMetaByRequest)
+	r.Get("/jobs/metrics/{id}", api.getJobMetrics)
+	r.Delete("/jobs/delete_job/", api.deleteJobByRequest)
+	r.Delete("/jobs/delete_job/{id}", api.deleteJobByID)
+	r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore)
+	r.Post("/jobs/{id}", api.getJobByID)
+	r.Get("/jobs/{id}", api.getCompleteJobByID)

-	r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete)
+	r.Delete("/tags/", api.removeTags)

 	if api.MachineStateDir != "" {
-		r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
-		r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
+		r.Get("/machine_state/{cluster}/{host}", api.getMachineState)
+		r.Put("/machine_state/{cluster}/{host}", api.putMachineState)
+		r.Post("/machine_state/{cluster}/{host}", api.putMachineState)
 	}
 }

-func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
-	r.StrictSlash(true)
+// MountUserAPIRoutes registers user-accessible REST API endpoints.
+// These are limited endpoints for regular users with JWT token authentication.
+func (api *RestAPI) MountUserAPIRoutes(r chi.Router) {
 	// REST API Uses TokenAuth
-	r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
-	r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
-	r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
-	r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
+	r.Get("/jobs/", api.getJobs)
+	r.Post("/jobs/{id}", api.getJobByID)
+	r.Get("/jobs/{id}", api.getCompleteJobByID)
+	r.Get("/jobs/metrics/{id}", api.getJobMetrics)
 }

-func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
+// MountMetricStoreAPIRoutes registers metric storage API endpoints.
+// These endpoints handle metric data ingestion and health checks with JWT token authentication.
+func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) {
 	// REST API Uses TokenAuth
-	// Note: StrictSlash handles trailing slash variations automatically
-	r.HandleFunc("/api/free", freeMetrics).Methods(http.MethodPost)
-	r.HandleFunc("/api/write", writeMetrics).Methods(http.MethodPost)
-	r.HandleFunc("/api/debug", debugMetrics).Methods(http.MethodGet)
-	r.HandleFunc("/api/healthcheck", metricsHealth).Methods(http.MethodGet)
+	r.Post("/free", freeMetrics)
+	r.Post("/write", writeMetrics)
+	r.Get("/debug", debugMetrics)
+	r.Post("/healthcheck", api.updateNodeStates)
-	r.HandleFunc("/api/free/", freeMetrics).Methods(http.MethodPost)
-	r.HandleFunc("/api/write/", writeMetrics).Methods(http.MethodPost)
-	r.HandleFunc("/api/debug/", debugMetrics).Methods(http.MethodGet)
-	r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
+	// Same endpoints but with trailing slash
+	r.Post("/free/", freeMetrics)
+	r.Post("/write/", writeMetrics)
+	r.Get("/debug/", debugMetrics)
+	r.Post("/healthcheck/", api.updateNodeStates)
 }
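chi has no equivalent of gorilla's StrictSlash, which is why each metric-store endpoint is registered twice above. For reference, an alternative this patch does not use is chi's StripSlashes middleware, which normalizes the trailing slash before routing:

package main

import (
	"net/http"

	"github.com/go-chi/chi/v5"
	"github.com/go-chi/chi/v5/middleware"
)

func main() {
	r := chi.NewRouter()
	// StripSlashes removes one trailing slash from the request path before
	// routing, so a single registration serves both /free and /free/.
	r.Use(middleware.StripSlashes)
	r.Post("/free", func(rw http.ResponseWriter, _ *http.Request) {
		rw.WriteHeader(http.StatusOK)
	})
	http.ListenAndServe(":8080", r)
}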
|
||||
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
// MountConfigAPIRoutes registers configuration and user management endpoints.
|
||||
// These routes use session-based authentication and require admin privileges.
|
||||
// Routes use full paths (including /config prefix) to avoid conflicting with
|
||||
// the /config page route when registered via Group instead of Route.
|
||||
func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) {
|
||||
// Settings Frontend Uses SessionAuth
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
|
||||
r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost)
|
||||
r.Get("/config/roles/", api.getRoles)
|
||||
r.Post("/config/users/", api.createUser)
|
||||
r.Put("/config/users/", api.createUser)
|
||||
r.Get("/config/users/", api.getUsers)
|
||||
r.Delete("/config/users/", api.deleteUser)
|
||||
r.Post("/config/user/{id}", api.updateUser)
|
||||
r.Post("/config/notice/", api.editNotice)
|
||||
r.Get("/config/taggers/", api.getTaggers)
|
||||
r.Post("/config/taggers/run/", api.runTagger)
|
||||
}
|
||||
}

-func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
-	r.StrictSlash(true)
+// MountFrontendAPIRoutes registers frontend-specific API endpoints.
+// These routes support JWT generation and user configuration updates with session authentication.
+func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) {
+	r.Get("/logs/", api.getJournalLog)
 	// Settings Frontend Uses SessionAuth
 	if api.Authentication != nil {
-		r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
-		r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
+		r.Get("/jwt/", api.getJWT)
+		r.Post("/configuration/", api.updateConfiguration)
 	}
 }
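A minimal sketch of how these Mount* helpers can be attached to a chi router tree. The mountAll wrapper and the middleware names (tokenAuth, sessionAuth) are assumptions for illustration, not part of this changeset; MountConfigAPIRoutes is attached via Group rather than Route because, as its comment notes, it registers full /config/... paths itself:

```go
// Sketch only; tokenAuth and sessionAuth are hypothetical middleware.
func mountAll(api *RestAPI, tokenAuth, sessionAuth func(http.Handler) http.Handler) http.Handler {
	r := chi.NewRouter()

	// The user REST API lives under /api and is guarded by JWT token auth.
	r.Route("/api", func(sub chi.Router) {
		sub.Use(tokenAuth)
		api.MountUserAPIRoutes(sub)
	})

	// Config routes already carry the /config prefix, so they share
	// middleware via Group instead of being nested under a Route.
	r.Group(func(sub chi.Router) {
		sub.Use(sessionAuth)
		api.MountConfigAPIRoutes(sub)
	})

	return r
}
```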

@@ -157,6 +181,8 @@ type DefaultAPIResponse struct {
 	Message string `json:"msg"`
 }

+// handleError writes a standardized JSON error response with the given status code.
+// It logs the error at WARN level and ensures proper Content-Type headers are set.
 func handleError(err error, statusCode int, rw http.ResponseWriter) {
 	cclog.Warnf("REST ERROR : %s", err.Error())
 	rw.Header().Add("Content-Type", "application/json")
@@ -169,15 +195,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
 	}
 }

+// decode reads JSON from r into val with strict validation that rejects unknown fields.
 func decode(r io.Reader, val any) error {
 	dec := json.NewDecoder(r)
 	dec.DisallowUnknownFields()
 	return dec.Decode(val)
 }
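Because decode calls DisallowUnknownFields, an unexpected key fails the whole request body instead of being silently dropped. A hypothetical caller (the tagRequest type is made up for illustration; assumes the standard strings package is imported):

```go
type tagRequest struct {
	Name string `json:"name"`
}

var req tagRequest
// Fails with `json: unknown field "extra"` rather than ignoring the key.
err := decode(strings.NewReader(`{"name":"hot","extra":1}`), &req)
```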

-func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
-	// SecuredCheck() only worked with TokenAuth: Removed
+// validatePathComponent checks if a path component contains potentially malicious patterns
+// that could be used for path traversal attacks. Returns an error if validation fails.
+func validatePathComponent(component, componentName string) error {
+	if strings.Contains(component, "..") ||
+		strings.Contains(component, "/") ||
+		strings.Contains(component, "\\") {
+		return fmt.Errorf("invalid %s", componentName)
+	}
+	return nil
+}
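Behavior sketch with illustrative inputs:

```go
validatePathComponent("node01", "host name")    // nil
validatePathComponent("../secret", "host name") // error: invalid host name
validatePathComponent(`a\b`, "cluster name")    // error: invalid cluster name
```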

+// editNotice godoc
+// @summary Update system notice
+// @tags Config
+// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
+// @accept mpfd
+// @produce plain
+// @param new-content formData string true "New notice content (max 10000 characters)"
+// @success 200 {string} string "Update Notice Content Success"
+// @failure 400 {object} ErrorResponse "Bad Request"
+// @failure 403 {object} ErrorResponse "Forbidden"
+// @failure 500 {object} ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /notice/ [post]
+func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
 	if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
 		handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
 		return
@@ -186,9 +235,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
 	// Get Value
 	newContent := r.FormValue("new-content")

 	// Validate content length to prevent DoS
-	if len(newContent) > 10000 {
-		handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw)
+	if len(newContent) > maxNoticeLength {
+		handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
 		return
 	}

@@ -200,7 +248,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
 			handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
 			return
 		}
-		ntxt.Close()
+		if err := ntxt.Close(); err != nil {
+			cclog.Warnf("Failed to close notice file: %v", err)
+		}
 	}

 	if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
@@ -210,13 +260,66 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {

 	rw.Header().Set("Content-Type", "text/plain")
 	rw.WriteHeader(http.StatusOK)
+	var msg []byte
 	if newContent != "" {
-		rw.Write([]byte("Update Notice Content Success"))
+		msg = []byte("Update Notice Content Success")
 	} else {
-		rw.Write([]byte("Empty Notice Content Success"))
+		msg = []byte("Empty Notice Content Success")
 	}
+	if _, err := rw.Write(msg); err != nil {
+		cclog.Errorf("Failed to write response: %v", err)
+	}
 }

+func (api *RestAPI) getTaggers(rw http.ResponseWriter, r *http.Request) {
+	if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
+		handleError(fmt.Errorf("only admins are allowed to list taggers"), http.StatusForbidden, rw)
+		return
+	}
+
+	rw.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(rw).Encode(tagger.ListTaggers()); err != nil {
+		cclog.Errorf("Failed to encode tagger list: %v", err)
+	}
+}
+
+func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {
+	if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
+		handleError(fmt.Errorf("only admins are allowed to run taggers"), http.StatusForbidden, rw)
+		return
+	}
+
+	name := r.FormValue("name")
+	if name == "" {
+		handleError(fmt.Errorf("missing required parameter: name"), http.StatusBadRequest, rw)
+		return
+	}
+
+	if err := tagger.RunTaggerByName(name); err != nil {
+		handleError(err, http.StatusConflict, rw)
+		return
+	}
+
+	rw.Header().Set("Content-Type", "text/plain")
+	rw.WriteHeader(http.StatusOK)
+	if _, err := rw.Write([]byte(fmt.Sprintf("Tagger %s started", name))); err != nil {
+		cclog.Errorf("Failed to write response: %v", err)
+	}
+}

+// getJWT godoc
+// @summary Generate JWT token
+// @tags Frontend
+// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
+// @accept mpfd
+// @produce plain
+// @param username formData string true "Username to generate JWT for"
+// @success 200 {string} string "JWT token"
+// @failure 403 {object} ErrorResponse "Forbidden"
+// @failure 404 {object} ErrorResponse "User Not Found"
+// @failure 500 {object} ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /jwt/ [get]
 func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
 	rw.Header().Set("Content-Type", "text/plain")
 	username := r.FormValue("username")
@@ -241,12 +344,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
 	}

 	rw.WriteHeader(http.StatusOK)
-	rw.Write([]byte(jwt))
+	if _, err := rw.Write([]byte(jwt)); err != nil {
+		cclog.Errorf("Failed to write JWT response: %v", err)
+	}
 }

+// getRoles godoc
+// @summary Get available roles
+// @tags Config
+// @description Returns a list of valid user roles. Only admins are allowed.
+// @produce json
+// @success 200 {array} string "List of role names"
+// @failure 403 {object} ErrorResponse "Forbidden"
+// @failure 500 {object} ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /roles/ [get]
 func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
-	// SecuredCheck() only worked with TokenAuth: Removed
-
 	user := repository.GetUserFromContext(r.Context())
 	if !user.HasRole(schema.RoleAdmin) {
 		handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
@@ -265,6 +378,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
 	}
 }

+// updateConfiguration godoc
+// @summary Update user configuration
+// @tags Frontend
+// @description Updates a user's configuration key-value pair.
+// @accept mpfd
+// @produce plain
+// @param key formData string true "Configuration key"
+// @param value formData string true "Configuration value"
+// @success 200 {string} string "success"
+// @failure 500 {object} ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /configuration/ [post]
 func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
 	rw.Header().Set("Content-Type", "text/plain")
 	key, value := r.FormValue("key"), r.FormValue("value")
@@ -275,26 +400,40 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
 	}

 	rw.WriteHeader(http.StatusOK)
-	rw.Write([]byte("success"))
+	if _, err := rw.Write([]byte("success")); err != nil {
+		cclog.Errorf("Failed to write response: %v", err)
+	}
 }

+// putMachineState godoc
+// @summary Store machine state
+// @tags Machine State
+// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
+// @accept json
+// @produce plain
+// @param cluster path string true "Cluster name"
+// @param host path string true "Host name"
+// @success 201 "Created"
+// @failure 400 {object} ErrorResponse "Bad Request"
+// @failure 404 {object} ErrorResponse "Machine state not enabled"
+// @failure 500 {object} ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /machine_state/{cluster}/{host} [put]
 func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
 	if api.MachineStateDir == "" {
 		handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
 		return
 	}

-	vars := mux.Vars(r)
-	cluster := vars["cluster"]
-	host := vars["host"]
+	cluster := chi.URLParam(r, "cluster")
+	host := chi.URLParam(r, "host")

 	// Validate cluster and host to prevent path traversal attacks
-	if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
-		handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
+	if err := validatePathComponent(cluster, "cluster name"); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
 		return
 	}
-	if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
-		handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
+	if err := validatePathComponent(host, "host name"); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
 		return
 	}
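The same mux-to-chi parameter migration recurs throughout this changeset; the pattern in isolation (sketch):

```go
// Before (gorilla/mux):
//	vars := mux.Vars(r)
//	cluster := vars["cluster"]
// After (chi) — the key must match the {cluster} placeholder in the route:
cluster := chi.URLParam(r, "cluster")
host := chi.URLParam(r, "host")
```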
@@ -320,23 +459,33 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
 	rw.WriteHeader(http.StatusCreated)
 }

+// getMachineState godoc
+// @summary Retrieve machine state
+// @tags Machine State
+// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
+// @produce json
+// @param cluster path string true "Cluster name"
+// @param host path string true "Host name"
+// @success 200 {object} object "Machine state JSON data"
+// @failure 400 {object} ErrorResponse "Bad Request"
+// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
+// @security ApiKeyAuth
+// @router /machine_state/{cluster}/{host} [get]
 func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
 	if api.MachineStateDir == "" {
 		handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
 		return
 	}

-	vars := mux.Vars(r)
-	cluster := vars["cluster"]
-	host := vars["host"]
+	cluster := chi.URLParam(r, "cluster")
+	host := chi.URLParam(r, "host")

 	// Validate cluster and host to prevent path traversal attacks
-	if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
-		handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
+	if err := validatePathComponent(cluster, "cluster name"); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
 		return
 	}
-	if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
-		handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
+	if err := validatePathComponent(host, "host name"); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
 		return
 	}
@@ -11,9 +11,9 @@ import (
 	"net/http"

 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
-	"github.com/gorilla/mux"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
+	"github.com/go-chi/chi/v5"
 )

 type APIReturnedUser struct {
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
 // @description Required query-parameter defines if all users or only users with additional special roles are returned.
 // @produce json
 // @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
-// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
+// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
 // @failure 400 {string} string "Bad Request"
 // @failure 401 {string} string "Unauthorized"
 // @failure 403 {string} string "Forbidden"
@@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {

 	// Handle role updates
 	if newrole != "" {
-		if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
+		if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
 			handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
 			return
 		}
@@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
 			cclog.Errorf("Failed to encode response: %v", err)
 		}
 	} else if delrole != "" {
-		if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
+		if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
 			handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
 			return
 		}
@@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
 			cclog.Errorf("Failed to encode response: %v", err)
 		}
 	} else if newproj != "" {
-		if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
+		if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
 			handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
 			return
 		}
@@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
 			cclog.Errorf("Failed to encode response: %v", err)
 		}
 	} else if delproj != "" {
-		if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
+		if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
 			handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
 			return
 		}
@@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
 		return
 	}

-	if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
+	if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
 		handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
 		return
 	}
@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).

 ```go
 // In archiver.go ArchiveJob() function
-jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
+jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
 // 0 = highest resolution
 // 300 = 5-minute resolution
 ```
@@ -170,7 +170,6 @@ All exported functions are safe for concurrent use:
 - `Start()` - Safe to call once
 - `TriggerArchiving()` - Safe from multiple goroutines
 - `Shutdown()` - Safe to call once
-- `WaitForArchiving()` - Deprecated, but safe

 Internal state is protected by:
 - Channel synchronization (`archiveChannel`)
@@ -185,6 +184,6 @@ Internal state is protected by:
 ## Dependencies

 - `internal/repository`: Database operations for job metadata
-- `internal/metricDataDispatcher`: Loading metric data from various backends
+- `internal/metricdispatch`: Loading metric data from various backends
 - `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
 - `cc-lib/schema`: Job and metric data structures
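A self-contained sketch of the channel hand-off listed above. The names mirror the diff (archiveChannel, archivePending, TriggerArchiving), but the Job type and the worker body are stand-ins:

```go
package main

import (
	"fmt"
	"sync"
)

type Job struct{ ID int64 }

var (
	archiveChannel = make(chan *Job, 128)
	archivePending sync.WaitGroup
)

// TriggerArchiving hands a finished job to the worker goroutine;
// callers never touch archiving state directly.
func TriggerArchiving(job *Job) {
	archivePending.Add(1)
	archiveChannel <- job
}

// archivingWorker is the single goroutine that owns all archiving state.
func archivingWorker() {
	for job := range archiveChannel {
		// load metric data, write to the archive backend, update DB status ...
		fmt.Printf("archived job %d\n", job.ID)
		archivePending.Done()
	}
}

func main() {
	go archivingWorker()
	TriggerArchiving(&Job{ID: 42})
	archivePending.Wait() // analogous to the deprecated WaitForArchiving()
}
```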
@@ -54,8 +54,8 @@ import (
 	"time"

 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	sq "github.com/Masterminds/squirrel"
 )
@@ -126,7 +126,7 @@ func archivingWorker() {
 			// not using meta data, called to load JobMeta into Cache?
 			// will fail if job meta not in repository
 			if _, err := jobRepo.FetchMetadata(job); err != nil {
-				cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
+				cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
 				jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
 				archivePending.Done()
 				continue
@@ -136,7 +136,7 @@ func archivingWorker() {
 			// Use shutdown context to allow cancellation
 			jobMeta, err := ArchiveJob(job, shutdownCtx)
 			if err != nil {
-				cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
+				cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
 				jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
 				archivePending.Done()
 				continue
@@ -145,24 +145,24 @@ func archivingWorker() {
 			stmt := sq.Update("job").Where("job.id = ?", job.ID)

 			if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
-				cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
+				cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
 				archivePending.Done()
 				continue
 			}
 			if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
-				cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
+				cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
 				archivePending.Done()
 				continue
 			}
 			// Update the jobs database entry one last time:
 			stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
 			if err := jobRepo.Execute(stmt); err != nil {
-				cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
+				cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
 				archivePending.Done()
 				continue
 			}
 			cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
-			cclog.Infof("archiving job (dbid: %d) successful", job.ID)
+			cclog.Infof("archiving job (dbid: %d) successful", *job.ID)

 			repository.CallJobStopHooks(job)
 			archivePending.Done()
@@ -9,11 +9,10 @@ import (
 	"context"
 	"math"

 	"github.com/ClusterCockpit/cc-backend/internal/config"
-	"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
+	"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
 	"github.com/ClusterCockpit/cc-backend/pkg/archive"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )

+// ArchiveJob archives a completed job's metric data to the configured archive backend.
@@ -60,7 +59,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
 		scopes = append(scopes, schema.MetricScopeAccelerator)
 	}

-	jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
+	jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resolution-Value retrieves highest res (60s)
 	if err != nil {
 		cclog.Error("Error while loading job data for archiving")
 		return nil, err
@@ -94,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
 		}
 	}

-	// If the file based archive is disabled,
-	// only return the JobMeta structure as the
-	// statistics in there are needed.
-	if config.Keys.DisableArchive {
-		return job, nil
-	}
-
 	return job, archive.GetHandle().ImportJob(job, &jobData)
 }
@@ -25,9 +25,9 @@ import (

 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
-	"github.com/ClusterCockpit/cc-lib/util"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
+	"github.com/ClusterCockpit/cc-lib/v2/util"
 	"github.com/gorilla/sessions"
 )
@@ -40,7 +40,7 @@ type Authenticator interface {
 	// authenticator should attempt the login. This method should not perform
 	// expensive operations or actual authentication.
 	CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)

 	// Login performs the actual authentication for the user.
 	// It returns the authenticated user or an error if authentication fails.
 	// The user parameter may be nil if the user doesn't exist in the database yet.
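An illustrative, deliberately trivial implementation of the contract described above. The Login signature is inferred from the LDAP and JWT authenticators elsewhere in this changeset, and the interface may declare further methods such as Init, so treat this as a partial sketch:

```go
type acceptKnownUsers struct{}

// CanLogin must stay cheap: it only decides whether this authenticator
// should attempt the login at all.
func (a *acceptKnownUsers) CanLogin(user *schema.User, username string,
	rw http.ResponseWriter, r *http.Request) (*schema.User, bool) {
	return user, user != nil
}

// Login does the real credential check (LDAP bind, JWT validation, ...).
func (a *acceptKnownUsers) Login(user *schema.User,
	rw http.ResponseWriter, r *http.Request) (*schema.User, error) {
	return user, nil
}
```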
@@ -65,13 +65,13 @@ var ipUserLimiters sync.Map
 func getIPUserLimiter(ip, username string) *rate.Limiter {
 	key := ip + ":" + username
 	now := time.Now()

 	if entry, ok := ipUserLimiters.Load(key); ok {
 		rle := entry.(*rateLimiterEntry)
 		rle.lastUsed = now
 		return rle.limiter
 	}

 	// More aggressive rate limiting: 5 attempts per 15 minutes
 	newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
 	ipUserLimiters.Store(key, &rateLimiterEntry{
@@ -176,7 +176,7 @@ func (auth *Authentication) AuthViaSession(
 func Init(authCfg *json.RawMessage) {
 	initOnce.Do(func() {
 		authInstance = &Authentication{}

 		// Start background cleanup of rate limiters
 		startRateLimiterCleanup()
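The limiter configuration above means: a burst bucket of 5 tokens that refills one token every 15min/5 = 3 minutes. Illustration with golang.org/x/time/rate:

```go
lim := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
for i := 1; i <= 6; i++ {
	// Attempts 1-5 print true; attempt 6 prints false until a token refills.
	fmt.Println(i, lim.Allow())
}
```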
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
 }

 // handleUserSync syncs or updates a user in the database based on configuration.
-// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
+// This is used for LDAP, JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
 func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
 	r := repository.GetUserRepository()
 	dbUser, err := r.GetUser(user.Username)
@@ -272,7 +272,7 @@ func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool)
 		cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
 		return
 	}

 	if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
 		if err := r.AddUser(user); err != nil {
 			cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
@@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) {
 	handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
 }

+// handleLdapUser syncs an LDAP user with the database
+func handleLdapUser(ldapUser *schema.User) {
+	handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
+}
+
 func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
 	session, err := auth.sessionStore.New(r, "session")
 	if err != nil {
@@ -305,8 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
 	if auth.SessionMaxAge != 0 {
 		session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
 	}
-	if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
-		cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
+	if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
+		// If neither TLS nor an encrypted reverse proxy is used, do not mark cookies as secure.
+		cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
+		if r.Header.Get("X-Forwarded-Proto") == "" {
+			// This warning will not be printed if e.g. X-Forwarded-Proto == http
+			cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
+		}
 		session.Options.Secure = false
 	}
 	session.Options.SameSite = http.SameSiteStrictMode
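The Secure-flag decision above, condensed into a predicate (a sketch mirroring the diff, not an exported helper of this package):

```go
// A request counts as encrypted if it arrived over TLS directly, or via a
// reverse proxy that terminated TLS and set X-Forwarded-Proto accordingly.
func requestIsEncrypted(r *http.Request) bool {
	return r.TLS != nil || r.Header.Get("X-Forwarded-Proto") == "https"
}
```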
@@ -438,13 +448,13 @@ func (auth *Authentication) AuthAPI(
 	if user != nil {
 		switch {
 		case len(user.Roles) == 1:
-			if user.HasRole(schema.RoleApi) {
+			if user.HasRole(schema.RoleAPI) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
 			}
 		case len(user.Roles) >= 2:
-			if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
+			if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
@@ -474,13 +484,13 @@ func (auth *Authentication) AuthUserAPI(
 	if user != nil {
 		switch {
 		case len(user.Roles) == 1:
-			if user.HasRole(schema.RoleApi) {
+			if user.HasRole(schema.RoleAPI) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
 			}
 		case len(user.Roles) >= 2:
-			if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
+			if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
@@ -510,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI(
 	if user != nil {
 		switch {
 		case len(user.Roles) == 1:
-			if user.HasRole(schema.RoleApi) {
+			if user.HasRole(schema.RoleAPI) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
 			}
 		case len(user.Roles) >= 2:
-			if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
+			if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
 				ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
 				onsuccess.ServeHTTP(rw, r.WithContext(ctx))
 				return
@@ -616,9 +626,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
 	}
 	// If SplitHostPort fails, IPAddress is already just a host (no port)

-	// If nothing declared in config: deny all request to this api endpoint
+	// If nothing declared in config: Continue
 	if len(config.Keys.APIAllowedIPs) == 0 {
-		return fmt.Errorf("missing configuration key ApiAllowedIPs")
+		return nil
 	}
 	// If wildcard declared in config: Continue
 	if config.Keys.APIAllowedIPs[0] == "*" {
@@ -15,25 +15,25 @@ import (
 func TestGetIPUserLimiter(t *testing.T) {
 	ip := "192.168.1.1"
 	username := "testuser"

 	// Get limiter for the first time
 	limiter1 := getIPUserLimiter(ip, username)
 	if limiter1 == nil {
 		t.Fatal("Expected limiter to be created")
 	}

 	// Get the same limiter again
 	limiter2 := getIPUserLimiter(ip, username)
 	if limiter1 != limiter2 {
 		t.Error("Expected to get the same limiter instance")
 	}

 	// Get a different limiter for different user
 	limiter3 := getIPUserLimiter(ip, "otheruser")
 	if limiter1 == limiter3 {
 		t.Error("Expected different limiter for different user")
 	}

 	// Get a different limiter for different IP
 	limiter4 := getIPUserLimiter("192.168.1.2", username)
 	if limiter1 == limiter4 {
@@ -45,16 +45,16 @@ func TestGetIPUserLimiter(t *testing.T) {
 func TestRateLimiterBehavior(t *testing.T) {
 	ip := "10.0.0.1"
 	username := "ratelimituser"

 	limiter := getIPUserLimiter(ip, username)

 	// Should allow first 5 attempts
-	for i := 0; i < 5; i++ {
+	for i := range 5 {
 		if !limiter.Allow() {
 			t.Errorf("Request %d should be allowed within rate limit", i+1)
 		}
 	}

 	// 6th attempt should be blocked
 	if limiter.Allow() {
 		t.Error("Request 6 should be blocked by rate limiter")
@@ -65,19 +65,19 @@ func TestRateLimiterBehavior(t *testing.T) {
 func TestCleanupOldRateLimiters(t *testing.T) {
 	// Clear all existing limiters first to avoid interference from other tests
 	cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))

 	// Create some new rate limiters
 	limiter1 := getIPUserLimiter("1.1.1.1", "user1")
 	limiter2 := getIPUserLimiter("2.2.2.2", "user2")

 	if limiter1 == nil || limiter2 == nil {
 		t.Fatal("Failed to create test limiters")
 	}

 	// Cleanup limiters older than 1 second from now (should keep both)
 	time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
 	cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))

 	// Verify they still exist (should get same instance)
 	if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
 		t.Error("Limiter 1 was incorrectly cleaned up")
@@ -85,10 +85,10 @@ func TestCleanupOldRateLimiters(t *testing.T) {
 	if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
 		t.Error("Limiter 2 was incorrectly cleaned up")
 	}

 	// Cleanup limiters older than 1 hour from now (should remove both)
 	cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))

 	// Getting them again should create new instances
 	newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
 	if newLimiter1 == limiter1 {
@@ -107,14 +107,14 @@ func TestIPv4Extraction(t *testing.T) {
 		{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
 		{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := tt.input
 			if host, _, err := net.SplitHostPort(result); err == nil {
 				result = host
 			}

 			if result != tt.expected {
 				t.Errorf("Expected %s, got %s", tt.expected, result)
 			}
@@ -122,7 +122,7 @@ func TestIPv4Extraction(t *testing.T) {
 	}
 }

 // TestIPv6Extraction tests extracting IPv6 addresses
 func TestIPv6Extraction(t *testing.T) {
 	tests := []struct {
 		name string
@@ -134,14 +134,14 @@ func TestIPv6Extraction(t *testing.T) {
 		{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
 		{"IPv6 localhost", "::1", "::1"},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := tt.input
 			if host, _, err := net.SplitHostPort(result); err == nil {
 				result = host
 			}

 			if result != tt.expected {
 				t.Errorf("Expected %s, got %s", tt.expected, result)
 			}
@@ -160,14 +160,14 @@ func TestIPExtractionEdgeCases(t *testing.T) {
 		{"Empty string", "", ""},
 		{"Just port", ":8080", ""},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := tt.input
 			if host, _, err := net.SplitHostPort(result); err == nil {
 				result = host
 			}

 			if result != tt.expected {
 				t.Errorf("Expected %s, got %s", tt.expected, result)
 			}
@@ -14,8 +14,8 @@ import (
 	"strings"
 	"time"

-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/golang-jwt/jwt/v5"
 )
@@ -25,20 +25,20 @@ type JWTAuthConfig struct {
 	MaxAge string `json:"max-age"`

 	// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
-	CookieName string `json:"cookieName"`
+	CookieName string `json:"cookie-name"`

 	// Deny login for users not in database (but defined in JWT).
 	// Ignore user roles defined in JWTs ('roles' claim), get them from db.
-	ValidateUser bool `json:"validateUser"`
+	ValidateUser bool `json:"validate-user"`

 	// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
-	TrustedIssuer string `json:"trustedIssuer"`
+	TrustedIssuer string `json:"trusted-issuer"`

 	// Should a non-existent user be added to the DB based on the information in the token
-	SyncUserOnLogin bool `json:"syncUserOnLogin"`
+	SyncUserOnLogin bool `json:"sync-user-on-login"`

 	// Should an existing user be updated in the DB based on the information in the token
-	UpdateUserOnLogin bool `json:"updateUserOnLogin"`
+	UpdateUserOnLogin bool `json:"update-user-on-login"`
 }

 type JWTAuthenticator struct {
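Consequence of the tag renames: configuration files must now use kebab-case keys. Shown with a stand-in struct, since the real JWTAuthConfig carries more fields:

```go
type jwtCfg struct {
	CookieName   string `json:"cookie-name"`
	ValidateUser bool   `json:"validate-user"`
}

var cfg jwtCfg
// "cookieName" would no longer match; "cookie-name" does.
_ = json.Unmarshal([]byte(`{"cookie-name":"jwt","validate-user":true}`), &cfg)
// cfg.CookieName == "jwt", cfg.ValidateUser == true
```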
@@ -101,20 +101,20 @@ func (ja *JWTAuthenticator) AuthViaJWT(

 	// Token is valid, extract payload
 	claims := token.Claims.(jwt.MapClaims)

 	// Use shared helper to get user from JWT claims
 	var user *schema.User
 	user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
 	if err != nil {
 		return nil, err
 	}

 	// If not validating user, we only get roles from JWT (no projects for this auth method)
 	if !Keys.JwtConfig.ValidateUser {
 		user.Roles = extractRolesFromClaims(claims, false)
 		user.Projects = nil // Standard JWT auth doesn't include projects
 	}

 	return user, nil
 }
@@ -12,8 +12,8 @@ import (
 	"net/http"
 	"os"

-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/golang-jwt/jwt/v5"
 )
@@ -146,13 +146,13 @@ func (ja *JWTCookieSessionAuthenticator) Login(
 	}

 	claims := token.Claims.(jwt.MapClaims)

 	// Use shared helper to get user from JWT claims
 	user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
 	if err != nil {
 		return nil, err
 	}

 	// Sync or update user if configured
 	if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
 		handleTokenUser(user)
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -28,7 +29,7 @@ func extractStringFromClaims(claims jwt.MapClaims, key string) string {
|
||||
// If validateRoles is true, only valid roles are returned
|
||||
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||
var roles []string
|
||||
|
||||
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
@@ -42,14 +43,14 @@ func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return roles
|
||||
}
|
||||
|
||||
// extractProjectsFromClaims extracts projects from JWT claims
|
||||
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||
projects := make([]string, 0)
|
||||
|
||||
|
||||
if rawprojs, ok := claims["projects"].([]any); ok {
|
||||
for _, pp := range rawprojs {
|
||||
if p, ok := pp.(string); ok {
|
||||
@@ -61,7 +62,7 @@ func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||
projects = append(projects, projSlice...)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return projects
|
||||
}
|
||||
|
||||
@@ -72,22 +73,23 @@ func extractNameFromClaims(claims jwt.MapClaims) string {
|
||||
if name, ok := claims["name"].(string); ok {
|
||||
return name
|
||||
}
|
||||
|
||||
|
||||
// Try nested structure: {name: {values: [...]}}
|
||||
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||
if vals, ok := wrap["values"].([]any); ok {
|
||||
if len(vals) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%v", vals[0])
|
||||
|
||||
var name strings.Builder
|
||||
name.WriteString(fmt.Sprintf("%v", vals[0]))
|
||||
for i := 1; i < len(vals); i++ {
|
||||
name += fmt.Sprintf(" %v", vals[i])
|
||||
name.WriteString(fmt.Sprintf(" %v", vals[i]))
|
||||
}
|
||||
return name
|
||||
return name.String()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
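The switch to strings.Builder avoids reallocating the string on every += in the loop; the Builder appends into one growing buffer. The same pattern in isolation:

```go
var b strings.Builder
for i, v := range []any{"Jane", "Q", "Public"} {
	if i > 0 {
		b.WriteString(" ")
	}
	b.WriteString(fmt.Sprintf("%v", v))
}
name := b.String() // "Jane Q Public"
```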
@@ -100,7 +102,7 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
 	if sub == "" {
 		return nil, errors.New("missing 'sub' claim in JWT")
 	}

 	if validateUser {
 		// Validate user against database
 		ur := repository.GetUserRepository()
@@ -109,22 +111,22 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
 			cclog.Errorf("Error while loading user '%v': %v", sub, err)
 			return nil, fmt.Errorf("database error: %w", err)
 		}

 		// Deny any logins for unknown usernames
 		if user == nil || err == sql.ErrNoRows {
 			cclog.Warn("Could not find user from JWT in internal database.")
 			return nil, errors.New("unknown user")
 		}

 		// Return database user (with database roles)
 		return user, nil
 	}

 	// Create user from JWT claims
 	name := extractNameFromClaims(claims)
 	roles := extractRolesFromClaims(claims, true) // Validate roles
 	projects := extractProjectsFromClaims(claims)

 	return &schema.User{
 		Username: sub,
 		Name:     name,
@@ -8,7 +8,7 @@ package auth
 import (
 	"testing"

-	"github.com/ClusterCockpit/cc-lib/schema"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/golang-jwt/jwt/v5"
 )
@@ -19,7 +19,7 @@ func TestExtractStringFromClaims(t *testing.T) {
 		"email": "test@example.com",
 		"age":   25, // not a string
 	}

 	tests := []struct {
 		name string
 		key  string
@@ -30,7 +30,7 @@ func TestExtractStringFromClaims(t *testing.T) {
 		{"Non-existent key", "missing", ""},
 		{"Non-string value", "age", ""},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := extractStringFromClaims(claims, tt.key)
@@ -88,16 +88,16 @@ func TestExtractRolesFromClaims(t *testing.T) {
 			expected: []string{},
 		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := extractRolesFromClaims(tt.claims, tt.validateRoles)

 			if len(result) != len(tt.expected) {
 				t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
 				return
 			}

 			for i, role := range result {
 				if i >= len(tt.expected) || role != tt.expected[i] {
 					t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
@@ -141,16 +141,16 @@ func TestExtractProjectsFromClaims(t *testing.T) {
 			expected: []string{"project1", "project2"}, // Should skip non-strings
 		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := extractProjectsFromClaims(tt.claims)

 			if len(result) != len(tt.expected) {
 				t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
 				return
 			}

 			for i, project := range result {
 				if i >= len(tt.expected) || project != tt.expected[i] {
 					t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
@@ -216,7 +216,7 @@ func TestExtractNameFromClaims(t *testing.T) {
 			expected: "123 Smith", // Should convert to string
 		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := extractNameFromClaims(tt.claims)
@@ -235,29 +235,28 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
 		"roles":    []any{"user", "admin"},
 		"projects": []any{"project1", "project2"},
 	}

 	user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
 	if err != nil {
 		t.Fatalf("Unexpected error: %v", err)
 	}

 	if user.Username != "testuser" {
 		t.Errorf("Expected username 'testuser', got '%s'", user.Username)
 	}

 	if user.Name != "Test User" {
 		t.Errorf("Expected name 'Test User', got '%s'", user.Name)
 	}

 	if len(user.Roles) != 2 {
 		t.Errorf("Expected 2 roles, got %d", len(user.Roles))
 	}

 	if len(user.Projects) != 2 {
 		t.Errorf("Expected 2 projects, got %d", len(user.Projects))
 	}

 	if user.AuthType != schema.AuthToken {
 		t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
 	}
@@ -268,13 +267,13 @@ func TestGetUserFromJWT_MissingSub(t *testing.T) {
 	claims := jwt.MapClaims{
 		"name": "Test User",
 	}

 	_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
 	if err == nil {
 		t.Error("Expected error for missing sub claim")
 	}

 	if err.Error() != "missing 'sub' claim in JWT" {
 		t.Errorf("Expected specific error message, got: %v", err)
 	}
@@ -13,8 +13,8 @@ import (
 	"os"
 	"strings"

-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/golang-jwt/jwt/v5"
 )
@@ -75,13 +75,13 @@ func (ja *JWTSessionAuthenticator) Login(
 	}

 	claims := token.Claims.(jwt.MapClaims)

 	// Use shared helper to get user from JWT claims
 	user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
 	if err != nil {
 		return nil, err
 	}

 	// Sync or update user if configured
 	if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
 		handleTokenUser(user)
@@ -6,35 +6,39 @@
 package auth

 import (
 	"errors"
 	"fmt"
+	"net"
 	"net/http"
 	"os"
 	"strings"
+	"time"

 	"github.com/ClusterCockpit/cc-backend/internal/repository"
-	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/schema"
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/go-ldap/ldap/v3"
 )

 type LdapConfig struct {
 	URL             string `json:"url"`
-	UserBase        string `json:"user_base"`
-	SearchDN        string `json:"search_dn"`
-	UserBind        string `json:"user_bind"`
-	UserFilter      string `json:"user_filter"`
-	UserAttr        string `json:"username_attr"`
-	SyncInterval    string `json:"sync_interval"` // Parsed using time.ParseDuration.
-	SyncDelOldUsers bool   `json:"sync_del_old_users"`
+	UserBase        string `json:"user-base"`
+	SearchDN        string `json:"search-dn"`
+	UserBind        string `json:"user-bind"`
+	UserFilter      string `json:"user-filter"`
+	UserAttr        string `json:"username-attr"`
+	UIDAttr         string `json:"uid-attr"`
+	SyncInterval    string `json:"sync-interval"` // Parsed using time.ParseDuration.
+	SyncDelOldUsers bool   `json:"sync-del-old-users"`

-	// Should an non-existent user be added to the DB if user exists in ldap directory
-	SyncUserOnLogin bool `json:"syncUserOnLogin"`
+	// Should a non-existent user be added to the DB if the user exists in the LDAP directory
+	SyncUserOnLogin   bool `json:"sync-user-on-login"`
+	UpdateUserOnLogin bool `json:"update-user-on-login"`
 }

 type LdapAuthenticator struct {
 	syncPassword string
 	UserAttr     string
+	UIDAttr      string
 }

 var _ Authenticator = (*LdapAuthenticator)(nil)
@@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error {
 		la.UserAttr = "gecos"
 	}

+	if Keys.LdapConfig.UIDAttr != "" {
+		la.UIDAttr = Keys.LdapConfig.UIDAttr
+	} else {
+		la.UIDAttr = "uid"
+	}
+
 	return nil
 }
@@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin(
 		if user.AuthSource == schema.AuthViaLDAP {
 			return user, true
 		}
-	} else {
-		if lc.SyncUserOnLogin {
-			l, err := la.getLdapConnection(true)
-			if err != nil {
-				cclog.Error("LDAP connection error")
-				return nil, false
-			}
-			defer l.Close()
-
-			// Search for the given username
-			searchRequest := ldap.NewSearchRequest(
-				lc.UserBase,
-				ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
-				fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
-				[]string{"dn", "uid", la.UserAttr}, nil)
-
-			sr, err := l.Search(searchRequest)
-			if err != nil {
-				cclog.Warn(err)
-				return nil, false
-			}
-
-			if len(sr.Entries) != 1 {
-				cclog.Warn("LDAP: User does not exist or too many entries returned")
-				return nil, false
-			}
-
-			entry := sr.Entries[0]
-			name := entry.GetAttributeValue(la.UserAttr)
-			var roles []string
-			roles = append(roles, schema.GetRoleString(schema.RoleUser))
-			projects := make([]string, 0)
-
-			user = &schema.User{
-				Username:   username,
-				Name:       name,
-				Roles:      roles,
-				Projects:   projects,
-				AuthType:   schema.AuthSession,
-				AuthSource: schema.AuthViaLDAP,
-			}
-
-			if err := repository.GetUserRepository().AddUser(user); err != nil {
-				cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
-				return nil, false
-			}
-
-			return user, true
-		}
+	} else if lc.SyncUserOnLogin {
+		l, err := la.getLdapConnection(true)
+		if err != nil {
+			cclog.Error("LDAP connection error")
+			return nil, false
+		}
+		defer l.Close()
+
+		// Search for the given username
+		searchRequest := ldap.NewSearchRequest(
+			lc.UserBase,
+			ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
+			fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
+			[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
+
+		sr, err := l.Search(searchRequest)
+		if err != nil {
+			cclog.Warn(err)
+			return nil, false
+		}
+
+		if len(sr.Entries) != 1 {
+			cclog.Warn("LDAP: User does not exist or too many entries returned")
+			return nil, false
+		}
+
+		entry := sr.Entries[0]
+		user = &schema.User{
+			Username:   username,
+			Name:       entry.GetAttributeValue(la.UserAttr),
+			Roles:      []string{schema.GetRoleString(schema.RoleUser)},
+			Projects:   make([]string, 0),
+			AuthType:   schema.AuthSession,
+			AuthSource: schema.AuthViaLDAP,
+		}
+
+		handleLdapUser(user)
+		return user, true
 	}

 	return nil, false
@@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login(
 	}
 	defer l.Close()

-	userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
+	userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
 	if err := l.Bind(userDn, r.FormValue("password")); err != nil {
 		cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
 			user.Username, err)
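What the two escaping calls buy: user-controlled input can no longer change the structure of the search filter or the bind DN. go-ldap v3 provides both helpers; outputs shown as comments (illustrative):

```go
// '(' , ')' and '*' are hex-escaped per RFC 4515:
fmt.Println(ldap.EscapeFilter("jane)(uid=*")) // jane\29\28uid=\2a
// DN metacharacters such as ',' are backslash-escaped:
fmt.Println(ldap.EscapeDN("jane,cn=admin")) // jane\,cn=admin
```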
@@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error {
 		lc.UserBase,
 		ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
 		lc.UserFilter,
-		[]string{"dn", "uid", la.UserAttr}, nil))
+		[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
 	if err != nil {
 		cclog.Warn("LDAP search error")
 		return err
@@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error {

 	newnames := map[string]string{}
 	for _, entry := range ldapResults.Entries {
-		username := entry.GetAttributeValue("uid")
+		username := entry.GetAttributeValue(la.UIDAttr)
 		if username == "" {
-			return errors.New("no attribute 'uid'")
+			return fmt.Errorf("no attribute '%s'", la.UIDAttr)
 		}

 		_, ok := users[username]
@@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error {

 	for username, where := range users {
 		if where == InDB && lc.SyncDelOldUsers {
-			ur.DelUser(username)
+			if err := ur.DelUser(username); err != nil {
+				cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
+				return err
+			}
 			cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
 		} else if where == InLdap {
 			name := newnames[username]

-			var roles []string
-			roles = append(roles, schema.GetRoleString(schema.RoleUser))
-			projects := make([]string, 0)
-
 			user := &schema.User{
 				Username:   username,
 				Name:       name,
-				Roles:      roles,
-				Projects:   projects,
+				Roles:      []string{schema.GetRoleString(schema.RoleUser)},
+				Projects:   make([]string, 0),
 				AuthSource: schema.AuthViaLDAP,
 			}
@@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error {

 func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
 	lc := Keys.LdapConfig
-	conn, err := ldap.DialURL(lc.URL)
+	conn, err := ldap.DialURL(lc.URL,
+		ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
 	if err != nil {
 		cclog.Warn("LDAP URL dial failed")
 		return nil, err
 	}
+	conn.SetTimeout(30 * time.Second)

 	if admin {
 		if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
)
|
||||
|
||||
|
||||
@@ -9,23 +9,24 @@ import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/coreos/go-oidc/v3/oidc"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/go-chi/chi/v5"
|
||||
"golang.org/x/oauth2"
|
||||
)
|
||||
|
||||
type OpenIDConfig struct {
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||
}
|
||||
|
||||
type OIDC struct {
|
||||
@@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
|
||||
MaxAge: int(time.Hour.Seconds()),
|
||||
Secure: r.TLS != nil,
|
||||
HttpOnly: true,
|
||||
SameSite: http.SameSiteLaxMode,
|
||||
}
|
||||
http.SetCookie(w, c)
|
||||
}
|
||||
@@ -59,7 +61,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
// Use context with timeout for provider initialization
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
@@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
ClientID: clientID,
|
||||
ClientSecret: clientSecret,
|
||||
Endpoint: provider.Endpoint(),
|
||||
RedirectURL: "oidc-callback",
|
||||
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
|
||||
Scopes: []string{oidc.ScopeOpenID, "profile"},
|
||||
}
|
||||
|
||||
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
|
||||
@@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
return oa
|
||||
}
|
||||
|
||||
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
|
||||
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
|
||||
r.HandleFunc("/oidc-login", oa.OAuth2Login)
|
||||
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
|
||||
}
|
||||
@@ -119,57 +120,96 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
// Exchange authorization code for token with timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
|
||||
cclog.Errorf("token exchange failed: %s", err.Error())
|
||||
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Get user info from OIDC provider with same timeout
|
||||
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
|
||||
cclog.Errorf("failed to get userinfo: %s", err.Error())
|
||||
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// // Extract the ID Token from OAuth2 token.
|
||||
// rawIDToken, ok := token.Extra("id_token").(string)
|
||||
// if !ok {
|
||||
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
|
||||
// }
|
||||
//
|
||||
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
||||
// // Parse and verify ID Token payload.
|
||||
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
|
||||
// if err != nil {
|
||||
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
|
||||
// }
|
||||
// Verify ID token and nonce to prevent replay attacks
|
||||
rawIDToken, ok := token.Extra("id_token").(string)
|
||||
if !ok {
|
||||
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
nonceCookie, err := r.Cookie("nonce")
|
||||
if err != nil {
|
||||
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
||||
idToken, err := verifier.Verify(ctx, rawIDToken)
|
||||
if err != nil {
|
||||
cclog.Errorf("ID token verification failed: %s", err.Error())
|
||||
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if idToken.Nonce != nonceCookie.Value {
|
||||
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
|
||||
return
|
||||
}
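The added block replaces the commented-out verifier with full ID token plus nonce checking. A reduced sketch of just the replay check, using go-oidc/v3 types and the cookie name from above:

package oidcsketch

import (
	"errors"
	"net/http"

	"github.com/coreos/go-oidc/v3/oidc"
)

// checkNonce sketches the replay protection above: the nonce issued at
// login time must reappear inside the verified, signed ID token, proving
// the token was minted for this login attempt and not replayed.
func checkNonce(r *http.Request, idToken *oidc.IDToken) error {
	c, err := r.Cookie("nonce")
	if err != nil {
		return errors.New("nonce cookie not found")
	}
	if idToken.Nonce != c.Value {
		return errors.New("nonce mismatch: token not minted for this login")
	}
	return nil
}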

projects := make([]string, 0)

// Extract custom claims
// Extract custom claims from userinfo
var claims struct {
Username string `json:"preferred_username"`
Name string `json:"name"`
Profile struct {
// Keycloak realm-level roles
RealmAccess struct {
Roles []string `json:"roles"`
} `json:"realm_access"`
// Keycloak client-level roles
ResourceAccess struct {
Client struct {
Roles []string `json:"roles"`
} `json:"clustercockpit"`
} `json:"resource_access"`
}
if err := userInfo.Claims(&claims); err != nil {
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to extract claims: %s", err.Error())
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
return
}

if claims.Username == "" {
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
return
}

// Merge roles from both client-level and realm-level access
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)

roleSet := make(map[string]bool)
for _, r := range oidcRoles {
switch r {
case "user":
roleSet[schema.GetRoleString(schema.RoleUser)] = true
case "admin":
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
case "manager":
roleSet[schema.GetRoleString(schema.RoleManager)] = true
case "support":
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
}
}

var roles []string
for _, r := range claims.Profile.Client.Roles {
switch r {
case "user":
roles = append(roles, schema.GetRoleString(schema.RoleUser))
case "admin":
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
}
for role := range roleSet {
roles = append(roles, role)
}
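Realm-level and client-level role lists can overlap, which is why the new code deduplicates through a set before building the final slice. A standalone sketch of the same idea; the role names are the Keycloak conventions assumed above. One design difference worth noting: unlike iterating a Go map (whose order is randomized), this variant keeps first-seen order stable, and it copies before appending so neither input slice's backing array is mutated:

package rolesketch

// mergeRoles deduplicates the union of client- and realm-level roles.
func mergeRoles(client, realm []string) []string {
	seen := make(map[string]bool)
	var out []string
	for _, r := range append(append([]string{}, client...), realm...) {
		if !seen[r] {
			seen[r] = true
			out = append(out, r)
		}
	}
	return out
}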

if len(roles) == 0 {
@@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
handleOIDCUser(user)
}

oa.authentication.SaveSession(rw, r, user)
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
return
}
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
}
@@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
codeVerifier := oauth2.GenerateVerifier()
setCallbackCookie(rw, r, "verifier", codeVerifier)

// Generate nonce for ID token replay protection
nonce, err := randString(16)
if err != nil {
http.Error(rw, "Internal error", http.StatusInternalServerError)
return
}
setCallbackCookie(rw, r, "nonce", nonce)

// Build redirect URL from the incoming request
scheme := "https"
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
scheme = "http"
}
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)

// Redirect user to consent page to ask for permission
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
oauth2.S256ChallengeOption(codeVerifier),
oidc.Nonce(nonce))
http.Redirect(rw, r, url, http.StatusFound)
}
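The callback URL is now derived from the incoming request rather than the fixed "oidc-callback" string removed earlier. A sketch of the proxy-aware scheme detection, with the header name used above:

package oidcsketch

import (
	"fmt"
	"net/http"
)

// callbackURL mirrors the logic above: behind a TLS-terminating reverse
// proxy r.TLS is nil, so the X-Forwarded-Proto header decides whether
// the externally visible scheme is https or http.
func callbackURL(r *http.Request) string {
	scheme := "https"
	if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
		scheme = "http"
	}
	return fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
}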

@@ -15,37 +15,44 @@ var configSchema = `
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"cookie-name": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"validate-user": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"trusted-issuer": {
"description": "Issuer that should be accepted when validating external JWTs",
"type": "string"
},
"syncUserOnLogin": {
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided in the JWT.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Whether existing user attributes in the DB should be updated at login with values provided in the JWT.",
"type": "boolean"
}
},
"required": ["max-age"]
},
"oidc": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
"type": "object",
"properties": {
"provider": {
"description": "OpenID Connect provider URL.",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Whether existing user attributes in the DB should be updated at login with values provided.",
"type": "boolean"
}
},
"required": ["provider"]
},
@@ -57,40 +64,48 @@ var configSchema = `
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"user-base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"search-dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"user-bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"user-filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"username-attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"sync-interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"sync-del-old-users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"uid-attr": {
"description": "LDAP attribute used as login username. Default: uid",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt if the user exists in the LDAP directory.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Whether existing user attributes in the DB should be updated at login with values from LDAP.",
"type": "boolean"
}
},
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
},
"required": ["jwts"]
}`
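All auth-related keys move from camelCase and snake_case to kebab-case in this schema. A hypothetical minimal auth section matching the new keys, embedded as a Go raw string to make the pairing concrete; every value here is illustrative, not a shipped default:

package config

// exampleAuthConfig is an illustrative (not shipped) fragment using the
// kebab-case keys required by the schema above.
const exampleAuthConfig = `{
  "jwts": { "max-age": "2h", "validate-user": true },
  "oidc": { "provider": "https://keycloak.example.org/realms/cc",
            "sync-user-on-login": true },
  "ldap": {
    "url": "ldaps://ldap.example.org",
    "user-base": "ou=people,dc=example,dc=org",
    "search-dn": "cn=reader,dc=example,dc=org",
    "user-bind": "uid={username},ou=people,dc=example,dc=org",
    "user-filter": "(objectClass=posixAccount)"
  }
}`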

@@ -11,8 +11,8 @@ import (
"encoding/json"
"time"

cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
)

type ProgramConfig struct {
@@ -20,7 +22,9 @@ type ProgramConfig struct {
Addr string `json:"addr"`

// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
APIAllowedIPs []string `json:"apiAllowedIPs"`
APIAllowedIPs []string `json:"api-allowed-ips"`

APISubjects *NATSConfig `json:"api-subjects"`

// Drop root permissions once .env was read and the port was taken.
User string `json:"user"`
@@ -35,16 +37,9 @@ type ProgramConfig struct {
EmbedStaticFiles bool `json:"embed-static-files"`
StaticFiles string `json:"static-files"`

// 'sqlite3' or 'mysql' (mysql will work for mariadb as well)
DBDriver string `json:"db-driver"`

// For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).
// Path to SQLite database file
DB string `json:"db"`

// Keep all metric data in the metric data repositories,
// do not write to the job-archive.
DisableArchive bool `json:"disable-archive"`

EnableJobTaggers bool `json:"enable-job-taggers"`

// Validate json input against schema
@@ -76,17 +71,42 @@ type ProgramConfig struct {

// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`

// Systemd unit name for log viewer (default: "clustercockpit")
SystemdUnit string `json:"systemd-unit"`

// Node state retention configuration
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
}

type NodeStateRetention struct {
Policy string `json:"policy"` // "delete" or "move"
Age int `json:"age"` // hours, default 24
TargetKind string `json:"target-kind"` // "file" or "s3"
TargetPath string `json:"target-path"`
TargetEndpoint string `json:"target-endpoint"`
TargetBucket string `json:"target-bucket"`
TargetAccessKey string `json:"target-access-key"`
TargetSecretKey string `json:"target-secret-key"`
TargetRegion string `json:"target-region"`
TargetUsePathStyle bool `json:"target-use-path-style"`
MaxFileSizeMB int `json:"max-file-size-mb"`
}
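To make the field pairing concrete, a hypothetical value for the new struct in package config; none of these values are shipped defaults, and the unset file-target fields simply take their zero values:

// Illustrative only: a "move" policy that archives node_state rows
// older than 48 hours to Parquet files in an S3 bucket.
var exampleRetention = NodeStateRetention{
	Policy:             "move",
	Age:                48,
	TargetKind:         "s3",
	TargetEndpoint:     "https://s3.example.org",
	TargetBucket:       "nodestate-archive",
	TargetRegion:       "us-east-1",
	TargetUsePathStyle: true,
	MaxFileSizeMB:      128,
}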

type ResampleConfig struct {
// Minimum number of points to trigger resampling of data
MinimumPoints int `json:"minimumPoints"`
MinimumPoints int `json:"minimum-points"`
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}

type NATSConfig struct {
SubjectJobEvent string `json:"subject-job-event"`
SubjectNodeState string `json:"subject-node-state"`
}

type IntRange struct {
From int `json:"from"`
To int `json:"to"`
@@ -100,32 +120,20 @@ type TimeRange struct {

type FilterRanges struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
NumNodes *IntRange `json:"num-nodes"`
StartTime *TimeRange `json:"start-time"`
}

type ClusterConfig struct {
Name string `json:"name"`
FilterRanges *FilterRanges `json:"filterRanges"`
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
}

var Clusters []*ClusterConfig

var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
EmbedStaticFiles: true,
DBDriver: "sqlite3",
DB: "./var/job.db",
DisableArchive: false,
Validate: false,
SessionMaxAge: "168h",
StopJobsExceedingWalltime: 0,
ShortRunningJobsDuration: 5 * 60,
}

func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
@@ -133,17 +141,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}

Validate(clustersSchema, clusterConfig)
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Clusters); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}

if len(Clusters) < 1 {
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
}

if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}

@@ -8,19 +8,15 @@ package config
import (
"testing"

ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)

func TestInit(t *testing.T) {
fp := "../../configs/config.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}

@@ -15,7 +15,7 @@ import (

type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
DefaultMetrics string `json:"default-metrics"`
}

type DefaultMetricsConfig struct {

@@ -6,14 +6,14 @@
package config

var configSchema = `
{
{
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"api-allowed-ips": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
@@ -41,13 +41,9 @@ var configSchema = `
"type": "string"
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"description": "Path to SQLite database file (e.g., './var/job.db')",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"enable-job-taggers": {
"description": "Turn on automatic application and jobclass taggers",
"type": "boolean"
@@ -81,28 +77,22 @@ var configSchema = `
"type": "integer"
},
"emission-constant": {
"description": ".",
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
"machine-state-dir": {
"description": "Where to store MachineState files.",
"type": "string"
},
"enable-resampling": {
"systemd-unit": {
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
"type": "string"
},
"resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"minimumPoints": {
"minimum-points": {
"description": "Minimum points to trigger resampling of time-series data.",
"type": "integer"
},
@@ -119,87 +109,74 @@ var configSchema = `
}
},
"required": ["trigger", "resolutions"]
}
},
"required": ["apiAllowedIPs"]
}`

var clustersSchema = `
{
"type": "array",
"items": {
},
"api-subjects": {
"description": "NATS subjects configuration for subscribing to job and node events.",
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"subject-job-event": {
"description": "NATS subject for job events (start_job, stop_job)",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": ["kind"]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": ["from", "to"]
}
},
"required": ["numNodes", "duration", "startTime"]
"subject-node-state": {
"description": "NATS subject for node state updates",
"type": "string"
}
},
"required": ["name", "metricDataRepository", "filterRanges"],
"minItems": 1
"required": ["subject-job-event", "subject-node-state"]
},
"nodestate-retention": {
"description": "Node state retention configuration for cleaning up old node_state rows.",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
"type": "string",
"enum": ["delete", "move"]
},
"age": {
"description": "Retention age in hours (default: 24).",
"type": "integer"
},
"target-kind": {
"description": "Target kind for parquet archiving: 'file' or 's3'.",
"type": "string",
"enum": ["file", "s3"]
},
"target-path": {
"description": "Filesystem path for parquet file target.",
"type": "string"
},
"target-endpoint": {
"description": "S3 endpoint URL.",
"type": "string"
},
"target-bucket": {
"description": "S3 bucket name.",
"type": "string"
},
"target-access-key": {
"description": "S3 access key.",
"type": "string"
},
"target-secret-key": {
"description": "S3 secret key.",
"type": "string"
},
"target-region": {
"description": "S3 region.",
"type": "string"
},
"target-use-path-style": {
"description": "Use path-style S3 addressing.",
"type": "boolean"
},
"max-file-size-mb": {
"description": "Maximum parquet file size in MB (default: 128).",
"type": "integer"
}
},
"required": ["policy"]
}
}`
}
}`

@@ -8,7 +8,7 @@ package config
import (
"encoding/json"

cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)

File diff suppressed because it is too large
@@ -10,9 +10,21 @@ import (
"time"

"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)

type ClusterMetricWithName struct {
Name string `json:"name"`
Unit *schema.Unit `json:"unit,omitempty"`
Timestep int `json:"timestep"`
Data []schema.Float `json:"data"`
}

type ClusterMetrics struct {
NodeCount int `json:"nodeCount"`
Metrics []*ClusterMetricWithName `json:"metrics"`
}

type Count struct {
Name string `json:"name"`
Count int `json:"count"`
@@ -59,6 +71,7 @@ type JobFilter struct {
Project *StringInput `json:"project,omitempty"`
JobName *StringInput `json:"jobName,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *config.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
@@ -70,6 +83,7 @@ type JobFilter struct {
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}

@@ -173,7 +187,7 @@ type NamedStatsWithScope struct {
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
Subcluster *StringInput `json:"subcluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
HealthState *string `json:"healthState,omitempty"`
TimeStart *int `json:"timeStart,omitempty"`

@@ -4,7 +4,7 @@ import (
"sync"

"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)

@@ -1,13 +1,15 @@
package graph

// This file will be automatically regenerated based on the schema, any resolver implementations
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.81
// Code generated by github.com/99designs/gqlgen version v0.17.85

import (
"context"
"errors"
"fmt"
"math"
"regexp"
"slices"
"strconv"
@@ -17,11 +19,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)

// Partitions is the resolver for the partitions field.
@@ -86,14 +89,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)

hwType := ""
switch test := name; { // Notice ';' for var declaration
case matchCpu.MatchString(test):
case matchCPU.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
@@ -173,9 +176,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}

tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -220,9 +223,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}

tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -263,9 +266,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
}

tags := []int{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id for removal")
return nil, err
@@ -281,7 +284,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagById(tid); err != nil {
if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
@@ -315,18 +318,39 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc
if obj.NodeState != "" {
return obj.NodeState, nil
} else {
return "", fmt.Errorf("No SchedulerState (NodeState) on Object")
return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object")
}
}

// HealthState is the resolver for the healthState field.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: HealthState - healthState"))
if obj.HealthState != "" {
return string(obj.HealthState), nil
} else {
return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object")
}
}

// MetaData is the resolver for the metaData field.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
panic(fmt.Errorf("not implemented: MetaData - metaData"))
if obj.MetaData != nil {
return obj.MetaData, nil
} else {
cclog.Debug("resolver: no MetaData (NodeState) on node object")
emptyMeta := make(map[string]string, 0)
return emptyMeta, nil
}
}

// HealthData is the resolver for the healthData field.
func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) {
if obj.HealthData != nil {
return obj.HealthData, nil
} else {
cclog.Debug("resolver: no HealthData (NodeState) on node object")
emptyHealth := make(map[string][]string, 0)
return emptyHealth, nil
}
}

// Clusters is the resolver for the clusters field.
@@ -341,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {

// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
user := repository.GetUserFromContext(ctx)

if user != nil {
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
return archive.GlobalUserMetricList, nil
}
}

return archive.GlobalMetricList, nil
}

@@ -371,12 +403,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
repo := repository.GetNodeRepository()
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
return repo.GetNodeById(numericId, false)
return repo.GetNodeByID(numericID, false)
}

// Nodes is the resolver for the nodes field.
@@ -387,6 +419,15 @@ func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, o
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}

// NodesWithMeta is the resolver for the nodesWithMeta field.
func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
// Why Extra Handler? -> graphql.CollectAllFields(ctx) only returns top-level fields (i.e.: items, count), and not subfields like item.metaData
repo := repository.GetNodeRepository()
nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused
count := len(nodes)
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
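The comment in NodesWithMeta names the gqlgen limitation driving the extra resolver. A sketch of what CollectAllFields can and cannot see, using the gqlgen API assumed by this file:

package graphsketch

import (
	"context"
	"slices"

	"github.com/99designs/gqlgen/graphql"
)

// wantsField reports whether a top-level field was selected. For a query
// like `nodes { items { metaData } count }` CollectAllFields yields only
// "items" and "count"; the nested "metaData" selection is invisible here,
// hence the dedicated NodesWithMeta resolver above.
func wantsField(ctx context.Context, name string) bool {
	return slices.Contains(graphql.CollectAllFields(ctx), name)
}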

// NodeStates is the resolver for the nodeStates field.
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
repo := repository.GetNodeRepository()
@@ -403,8 +444,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt
return nil, herr
}

allCounts := make([]*model.NodeStates, 0)
allCounts = append(stateCounts, healthCounts...)
allCounts := append(stateCounts, healthCounts...)

return allCounts, nil
}
@@ -431,18 +471,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod
return healthCounts, nil
}

return nil, errors.New("Unknown Node State Query Type")
return nil, errors.New("unknown Node State Query Type")
}

// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}

job, err := r.Repo.FindById(ctx, numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
cclog.Warn("Error while finding job by id")
return nil, err
@@ -475,7 +515,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return nil, err
}

data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
cclog.Warn("Error while loading job data")
return nil, err
@@ -503,7 +543,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
return nil, err
}

data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
@@ -528,7 +568,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
return nil, err
}

data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
@@ -542,7 +582,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
for _, stat := range stats {
mdlStats = append(mdlStats, &model.ScopedStats{
Hostname: stat.Hostname,
ID: stat.Id,
ID: stat.ID,
Data: stat.Data,
})
}
@@ -581,21 +621,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag

// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example Page 4 @ 10 IpP : Does item 41 exist?
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
hasNextPage := false
if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage = len(nextJobs) == 1
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}

hasNextPage := len(nextJobs) == 1

return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
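The probe trick commented above avoids a COUNT(*) per page. Worked through: on page 4 with 10 items per page, item 41 is the first item of page 5, so requesting exactly one item at that offset answers "is there a next page?" with a single cheap query. A self-contained sketch of the arithmetic:

package pagesketch

// pageRequest mirrors the shape of the model.PageRequest used above.
type pageRequest struct {
	ItemsPerPage int
	Page         int
}

// probeRequest builds the one-item lookahead: with 1 item per page,
// Page doubles as a 1-based item index, so page*perPage+1 addresses
// the first item beyond the current page.
func probeRequest(cur pageRequest) pageRequest {
	return pageRequest{ItemsPerPage: 1, Page: cur.Page*cur.ItemsPerPage + 1}
}

// hasNext interprets the probe result: exactly one row back means a
// next page exists; zero rows means the listing ends here.
func hasNext(probeResultLen int) bool { return probeResultLen == 1 }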
@@ -693,7 +736,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job

res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
@@ -744,13 +787,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}

defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
defaultMetrics = append(defaultMetrics, mc.Name)
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
metrics = defaultMetrics
} else {
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
})
}

data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -804,153 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nil, errors.New("you need to be administrator or support staff for this query")
}

nodeRepo := repository.GetNodeRepository()
// nodes -> array hostname
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
if nerr != nil {
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
}

if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}

// Build Filters
queryFilters := make([]*model.NodeFilter, 0)
if cluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
}
if subCluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Subcluster: &model.StringInput{Eq: &subCluster}})
}
if nodeFilter != "" && stateFilter != "notindb" {
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
}
if stateFilter != "all" && stateFilter != "notindb" {
var queryState schema.SchedulerState = schema.SchedulerState(stateFilter)
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
}
// if healthFilter != "all" {
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
// }

// Special Case: Disable Paging for missing nodes filter, save IPP for later
var backupItems int
if stateFilter == "notindb" {
backupItems = page.ItemsPerPage
page.ItemsPerPage = -1
}

// Query Nodes From DB
nodeRepo := repository.GetNodeRepository()
rawNodes, serr := nodeRepo.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
if serr != nil {
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
return nil, serr
}

// Intermediate Node Result Info
nodes := make([]string, 0)
stateMap := make(map[string]string)
for _, node := range rawNodes {
nodes = append(nodes, node.Hostname)
stateMap[node.Hostname] = string(node.NodeState)
}

// Setup Vars
var countNodes int
var cerr error
var hasNextPage bool

// Special Case: Find Nodes not in DB node table but in metricStore only
if stateFilter == "notindb" {
// Reapply Original Paging
page.ItemsPerPage = backupItems
// Get Nodes From Topology
var topoNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topoNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topoNodes = append(topoNodes, nodeList.PrintList()...)
}
}
// Compare to all nodes from cluster/subcluster in DB
var missingNodes []string
for _, scanNode := range topoNodes {
if !slices.Contains(nodes, scanNode) {
missingNodes = append(missingNodes, scanNode)
}
}
// Filter nodes by name
if nodeFilter != "" {
filteredNodesByName := []string{}
for _, missingNode := range missingNodes {
if strings.Contains(missingNode, nodeFilter) {
filteredNodesByName = append(filteredNodesByName, missingNode)
}
}
missingNodes = filteredNodesByName
}
// Sort Missing Nodes Alphanumerically
slices.Sort(missingNodes)
// Total Missing
countNodes = len(missingNodes)
// Apply paging
if countNodes > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > countNodes {
end = countNodes
hasNextPage = false
} else {
hasNextPage = true
}
nodes = missingNodes[start:end]
} else {
nodes = missingNodes
}

} else {
// DB Nodes: Count and Find Next Page
countNodes, cerr = nodeRepo.CountNodes(ctx, queryFilters)
if cerr != nil {
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
return nil, cerr
}

// Example Page 4 @ 10 IpP : Does item 41 exist?
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists.
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextNodes, err := nodeRepo.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used
if err != nil {
cclog.Warn("Error while querying next nodes")
return nil, err
}
hasNextPage = len(nextNodes) == 1
}
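The removed "notindb" branch is essentially a slice difference (topology minus DB) with substring filtering and hand-rolled paging. A condensed, self-contained sketch of the same steps, useful for comparing against the GetNodesForList replacement; all parameter names are my own:

package nodesketch

import (
	"slices"
	"strings"
)

// missingNodes returns one page of topology nodes absent from the DB,
// plus the total count and a next-page flag, mirroring the branch above.
func missingNodes(topo, inDB []string, filter string, page, perPage int) ([]string, int, bool) {
	var missing []string
	for _, n := range topo {
		// Set difference with an optional substring filter on the hostname.
		if !slices.Contains(inDB, n) && (filter == "" || strings.Contains(n, filter)) {
			missing = append(missing, n)
		}
	}
	slices.Sort(missing) // stable, alphanumeric ordering
	total := len(missing)
	start := (page - 1) * perPage
	if start >= total {
		return nil, total, false
	}
	end := min(start+perPage, total)
	return missing[start:end], total, end < total
}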

// Load Metric Data For Specified Nodes Only
data, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
// data -> map hostname:jobdata
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data (Resolver.NodeMetricsList)")
return nil, err
}

// Build Result
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
for _, hostname := range nodes {
host := &model.NodeMetrics{
Host: hostname,
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
Metrics: make([]*model.JobMetricWithName, 0),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}

for metric, scopedMetrics := range metrics {
for metric, scopedMetrics := range data[hostname] {
for scope, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
@@ -963,9 +898,9 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
nodeMetricsList = append(nodeMetricsList, host)
}

// Final Return
nodeMetricsListResult := &model.NodesResultList{
Items: nodeMetricsList,
Items: nodeMetricsList,
// TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357
TotalNodes: &countNodes,
HasNextPage: &hasNextPage,
}
@@ -973,6 +908,99 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub

return nodeMetricsListResult, nil
}

// ClusterMetrics is the resolver for the clusterMetrics field.
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}

if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}

// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
}

clusterMetricData := make([]*model.ClusterMetricWithName, 0)
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}

collectorTimestep := make(map[string]int)
collectorUnit := make(map[string]schema.Unit)
collectorData := make(map[string][]schema.Float)

for _, metrics := range data {
clusterMetrics.NodeCount += 1
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
// Collect Info Once
_, okTimestep := collectorTimestep[metric]
if !okTimestep {
collectorTimestep[metric] = scopedMetric.Timestep
}
_, okUnit := collectorUnit[metric]
if !okUnit {
collectorUnit[metric] = scopedMetric.Unit
}
// Collect Data
for _, ser := range scopedMetric.Series {
_, okData := collectorData[metric]
// Init With Datasize > 0
if !okData && len(ser.Data) != 0 {
collectorData[metric] = make([]schema.Float, len(ser.Data))
} else if !okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
}
// Sum if init'd and matching size
if okData && len(ser.Data) == len(collectorData[metric]) {
for i, val := range ser.Data {
if val.IsNaN() {
continue
} else {
collectorData[metric][i] += val
}
}
} else if okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
}
}
}
}
}

for metricName, data := range collectorData {
// use ccUnits for backend normalization to "Tera"
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
p_new := ccunit.NewPrefix("T")
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}

roundedData := make([]schema.Float, 0)
for _, v_old := range data {
v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
roundedData = append(roundedData, schema.Float(v_new))
}

cm := model.ClusterMetricWithName{
Name: metricName,
Unit: &u_new,
Timestep: collectorTimestep[metricName],
Data: roundedData,
}

clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
}

return &clusterMetrics, nil
}
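The normalization step at the end of ClusterMetrics scales every summed series to a common "Tera" prefix so heterogeneous metrics share one plot axis. Isolated into a runnable sketch, with the ccUnits calls exactly as invoked in the resolver and a made-up sample value:

package main

import (
	"fmt"
	"math"

	ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)

func main() {
	pOld := ccunit.NewPrefix("G") // incoming prefix, e.g. Giga
	pNew := ccunit.NewPrefix("T")
	conv := ccunit.GetPrefixPrefixFactor(pOld, pNew)

	// 1500 G -> 1.5 T; two-decimal rounding as in the resolver.
	v := math.Round(conv(float64(1500)).(float64)*100.0) / 100.0
	fmt.Println(v)
}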
|
||||
|
||||
// NumberOfNodes is the resolver for the numberOfNodes field.
|
||||
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
|
||||
nodeList, err := archive.ParseNodeList(obj.Nodes)
|
||||
|
||||
@@ -2,18 +2,20 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||
@@ -53,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
// resolution = max(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
|
||||
continue
|
||||
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
@@ -126,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
cclog.Error("Error while loading averages for footprint")
|
||||
return nil, err
|
||||
}
|
||||
@@ -185,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
func requireField(ctx context.Context, name string) bool {
    fields := graphql.CollectAllFields(ctx)

    for _, f := range fields {
        if f == name {
            return true
        }
    }

    return false
    return slices.Contains(fields, name)
}

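The hunk above replaces a hand-written membership loop with slices.Contains from the Go standard library (available since Go 1.21); both perform a linear scan for an exact match. A minimal sketch:

package main

import (
    "fmt"
    "slices"
)

func main() {
    fields := []string{"name", "unit", "data"}
    // Equivalent to the removed loop: linear scan for an exact match.
    fmt.Println(slices.Contains(fields, "unit")) // true
    fmt.Println(slices.Contains(fields, "id"))   // false
}
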
@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package importer

import (
@@ -14,8 +15,8 @@ import (
    "github.com/ClusterCockpit/cc-backend/internal/config"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
)

// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
@@ -37,7 +38,7 @@
func HandleImportFlag(flag string) error {
    r := repository.GetJobRepository()

    for _, pair := range strings.Split(flag, ",") {
    for pair := range strings.SplitSeq(flag, ",") {
        files := strings.Split(pair, ":")
        if len(files) != 2 {
            return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
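strings.SplitSeq (added in Go 1.24) yields substrings lazily as an iterator instead of allocating the full slice that strings.Split returns, which is why the loop header changes shape. A minimal sketch:

package main

import (
    "fmt"
    "strings"
)

func main() {
    flag := "a.json:a-data.json,b.json:b-data.json"
    // SplitSeq yields each substring on demand; no intermediate slice.
    for pair := range strings.SplitSeq(flag, ",") {
        fmt.Println(pair)
    }
}
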
@@ -101,7 +102,7 @@ func HandleImportFlag(flag string) error {
            return err
        }

        id, err := r.InsertJob(&job)
        id, err := r.InsertJobDirect(&job)
        if err != nil {
            cclog.Warn("Error while job db insert")
            return err

@@ -16,8 +16,8 @@ import (
    "github.com/ClusterCockpit/cc-backend/internal/importer"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)

// copyFile copies a file from source path to destination path.
@@ -50,42 +50,14 @@ func setup(t *testing.T) *repository.JobRepository {
    "main": {
        "addr": "0.0.0.0:8080",
        "validate": false,
        "apiAllowedIPs": [
        "api-allowed-ips": [
            "*"
        ]},
    "archive": {
        "kind": "file",
        "path": "./var/job-archive"
    },
    "clusters": [
        {
            "name": "testcluster",
            "metricDataRepository": {"kind": "test", "url": "bla:8081"},
            "filterRanges": {
                "numNodes": { "from": 1, "to": 64 },
                "duration": { "from": 0, "to": 86400 },
                "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
            }
        },
        {
            "name": "fritz",
            "metricDataRepository": {"kind": "test", "url": "bla:8081"},
            "filterRanges": {
                "numNodes": { "from": 1, "to": 944 },
                "duration": { "from": 0, "to": 86400 },
                "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
            }
        },
        {
            "name": "taurus",
            "metricDataRepository": {"kind": "test", "url": "bla:8081"},
            "filterRanges": {
                "numNodes": { "from": 1, "to": 4000 },
                "duration": { "from": 0, "to": 604800 },
                "startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
            }
        }
    ]}`
    }
}`

    cclog.Init("info", true)
    tmpdir := t.TempDir()
@@ -107,7 +79,7 @@ func setup(t *testing.T) *repository.JobRepository {
    }

    dbfilepath := filepath.Join(tmpdir, "test.db")
    err := repository.MigrateDB("sqlite3", dbfilepath)
    err := repository.MigrateDB(dbfilepath)
    if err != nil {
        t.Fatal(err)
    }
@@ -121,22 +93,18 @@ func setup(t *testing.T) *repository.JobRepository {

    // Load and check main configuration
    if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
        if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
            config.Init(cfg, clustercfg)
        } else {
            t.Fatal("Cluster configuration must be present")
        }
        config.Init(cfg)
    } else {
        t.Fatal("Main configuration must be present")
    }

    archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)

    if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
    if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
        t.Fatal(err)
    }

    repository.Connect("sqlite3", dbfilepath)
    repository.Connect(dbfilepath)
    return repository.GetJobRepository()
}

@@ -197,7 +165,7 @@ func TestHandleImportFlag(t *testing.T) {
    }

    result := readResult(t, testname)
    job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
    job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
    if err != nil {
        t.Fatal(err)
    }

@@ -22,8 +22,8 @@ import (

    "github.com/ClusterCockpit/cc-backend/internal/repository"
    "github.com/ClusterCockpit/cc-backend/pkg/archive"
    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/ClusterCockpit/cc-lib/v2/schema"
)

const (
@@ -111,18 +111,22 @@ func InitDB() error {
            continue
        }

        id, err := r.TransactionAddNamed(t,
        id, jobErr := r.TransactionAddNamed(t,
            repository.NamedJobInsert, jobMeta)
        if err != nil {
            cclog.Errorf("repository initDB(): %v", err)
        if jobErr != nil {
            cclog.Errorf("repository initDB(): %v", jobErr)
            errorOccured++
            continue
        }

        // Job successfully inserted, increment counter
        i += 1

        for _, tag := range jobMeta.Tags {
            tagstr := tag.Name + ":" + tag.Type
            tagID, ok := tags[tagstr]
            if !ok {
                var err error
                tagID, err = r.TransactionAdd(t,
                    addTagQuery,
                    tag.Name, tag.Type)
@@ -138,10 +142,6 @@ func InitDB() error {
                    setTagQuery,
                    id, tagID)
            }

            if err == nil {
                i += 1
            }
        }

    if errorOccured > 0 {
@@ -216,7 +216,7 @@ func enrichJobMetadata(job *schema.Job) error {
            metricEnergy = math.Round(rawEnergy*100.0) / 100.0
        }
    } else {
        cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
        cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
    }

    job.EnergyFootprint[fp] = metricEnergy
@@ -225,7 +225,7 @@ func enrichJobMetadata(job *schema.Job) error {

    job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
    if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
        cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
        cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
        return err
    }

@@ -2,12 +2,13 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package importer

import (
    "math"

    ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
    ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)

// getNormalizationFactor calculates the scaling factor needed to normalize a value

@@ -8,7 +8,7 @@ import (
    "fmt"
    "testing"

    ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
    ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)

// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.

@@ -1,482 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package memorystore

import (
    "bufio"
    "encoding/json"
    "errors"
    "fmt"
    "os"
    "path"
    "sort"
    "strconv"
    "strings"
    "sync"
    "sync/atomic"
    "time"

    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/linkedin/goavro/v2"
)

var NumAvroWorkers int = 4
var startUp bool = true
var ErrNoNewData error = errors.New("no data in the pool")

func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
    levels := make([]*AvroLevel, 0)
    selectors := make([][]string, 0)
    as.root.lock.RLock()
    // Cluster
    for sel1, l1 := range as.root.children {
        l1.lock.RLock()
        // Node
        for sel2, l2 := range l1.children {
            l2.lock.RLock()
            // Frequency
            for sel3, l3 := range l2.children {
                levels = append(levels, l3)
                selectors = append(selectors, []string{sel1, sel2, sel3})
            }
            l2.lock.RUnlock()
        }
        l1.lock.RUnlock()
    }
    as.root.lock.RUnlock()

    type workItem struct {
        level    *AvroLevel
        dir      string
        selector []string
    }

    n, errs := int32(0), int32(0)

    var wg sync.WaitGroup
    wg.Add(NumAvroWorkers)
    work := make(chan workItem, NumAvroWorkers*2)
    for range NumAvroWorkers {
        go func() {
            defer wg.Done()

            for workItem := range work {
                from := getTimestamp(workItem.dir)

                if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
                    if err == ErrNoNewArchiveData {
                        continue
                    }

                    cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
                    atomic.AddInt32(&errs, 1)
                } else {
                    atomic.AddInt32(&n, 1)
                }
            }
        }()
    }

    for i := range len(levels) {
        dir := path.Join(dir, path.Join(selectors[i]...))
        work <- workItem{
            level:    levels[i],
            dir:      dir,
            selector: selectors[i],
        }
    }

    close(work)
    wg.Wait()

    if errs > 0 {
        return int(n), fmt.Errorf("%d errors happened while creating avro checkpoints (%d successes)", errs, n)
    }

    startUp = false

    return int(n), nil
}

// getTimestamp returns the timestamp from the directory name
func getTimestamp(dir string) int64 {
    // Extract the resolution and timestamp from the directory name.
    // The existing avro files are named with epoch timestamps:
    // iterate over all the files in the directory, find the maximum
    // timestamp, and return it.

    resolution := path.Base(dir)
    dir = path.Dir(dir)

    files, err := os.ReadDir(dir)
    if err != nil {
        return 0
    }
    var maxTS int64 = 0

    if len(files) == 0 {
        return 0
    }

    for _, file := range files {
        if file.IsDir() {
            continue
        }
        name := file.Name()

        if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
            continue
        }

        ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
        if err != nil {
            fmt.Printf("error while parsing timestamp: %s\n", err.Error())
            continue
        }

        if ts > maxTS {
            maxTS = ts
        }
    }

    interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
    updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()

    if startUp {
        return 0
    }

    if updateTime < time.Now().Unix() {
        return 0
    }

    return maxTS
}
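A worked example of the freshness check at the end of getTimestamp, under assumed values (checkpoint interval 1m, CheckpointBufferMinutes = 3): if the newest file's timestamp plus the interval plus the remaining buffer minutes lies in the past, the function returns 0 so that a fresh checkpoint file is started.

package main

import (
    "fmt"
    "time"
)

func main() {
    // Assumed: interval = 1m, CheckpointBufferMinutes = 3.
    maxTS := time.Now().Add(-10 * time.Minute).Unix() // newest file is 10 min old
    interval := time.Minute
    updateTime := time.Unix(maxTS, 0).Add(interval).
        Add(time.Duration(3-1) * time.Minute).Unix()

    // The file's write window ended ~7 minutes ago, so getTimestamp
    // would return 0 and a new checkpoint file would be started.
    fmt.Println(updateTime < time.Now().Unix()) // true
}
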

func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
    l.lock.Lock()
    defer l.lock.Unlock()

    // fmt.Printf("Checkpointing directory: %s\n", dir)
    // filepath contains the resolution
    intRes, _ := strconv.Atoi(path.Base(dir))

    // find smallest overall timestamp in l.data map and delete it from l.data
    minTS := int64(1<<63 - 1)
    for ts, dat := range l.data {
        if ts < minTS && len(dat) != 0 {
            minTS = ts
        }
    }

    if from == 0 && minTS != int64(1<<63-1) {
        from = minTS
    }

    if from == 0 {
        return ErrNoNewArchiveData
    }

    var schema string
    var codec *goavro.Codec
    recordList := make([]map[string]any, 0)

    var f *os.File

    filePath := dir + fmt.Sprintf("_%d.avro", from)

    var err error

    fp_, err_ := os.Stat(filePath)
    if errors.Is(err_, os.ErrNotExist) {
        err = os.MkdirAll(path.Dir(dir), 0o755)
        if err != nil {
            return fmt.Errorf("failed to create directory: %v", err)
        }
    } else if fp_.Size() != 0 {
        f, err = os.Open(filePath)
        if err != nil {
            return fmt.Errorf("failed to open existing avro file: %v", err)
        }

        br := bufio.NewReader(f)

        reader, err := goavro.NewOCFReader(br)
        if err != nil {
            return fmt.Errorf("failed to create OCF reader: %v", err)
        }
        codec = reader.Codec()
        schema = codec.Schema()

        f.Close()
    }

    timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()

    if dumpAll {
        timeRef = time.Now().Unix()
    }

    // Empty values
    if len(l.data) == 0 {
        // we checkpoint avro files every 60 seconds
        repeat := 60 / intRes

        for range repeat {
            recordList = append(recordList, make(map[string]any))
        }
    }

    readFlag := true

    for ts := range l.data {
        flag := false
        if ts < timeRef {
            data := l.data[ts]

            schemaGen, err := generateSchema(data)
            if err != nil {
                return err
            }

            flag, schema, err = compareSchema(schema, schemaGen)
            if err != nil {
                return fmt.Errorf("failed to compare read and generated schema: %v", err)
            }
            if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {

                f.Close()

                f, err = os.Open(filePath)
                if err != nil {
                    return fmt.Errorf("failed to open Avro file: %v", err)
                }

                br := bufio.NewReader(f)

                ocfReader, err := goavro.NewOCFReader(br)
                if err != nil {
                    return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
                }

                for ocfReader.Scan() {
                    record, err := ocfReader.Read()
                    if err != nil {
                        return fmt.Errorf("failed to read record: %v", err)
                    }

                    recordList = append(recordList, record.(map[string]any))
                }

                f.Close()

                err = os.Remove(filePath)
                if err != nil {
                    return fmt.Errorf("failed to delete file: %v", err)
                }

                readFlag = false
            }
            codec, err = goavro.NewCodec(schema)
            if err != nil {
                return fmt.Errorf("failed to create codec after merged schema: %v", err)
            }

            recordList = append(recordList, generateRecord(data))
            delete(l.data, ts)
        }
    }

    if len(recordList) == 0 {
        return ErrNoNewArchiveData
    }

    f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
    if err != nil {
        return fmt.Errorf("failed to append new avro file: %v", err)
    }

    // fmt.Printf("Codec : %#v\n", codec)

    writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
        W:               f,
        Codec:           codec,
        CompressionName: goavro.CompressionDeflateLabel,
    })
    if err != nil {
        return fmt.Errorf("failed to create OCF writer: %v", err)
    }

    // Append the new record
    if err := writer.Append(recordList); err != nil {
        return fmt.Errorf("failed to append record: %v", err)
    }

    f.Close()

    return nil
}

func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
    var genSchema, readSchema AvroSchema

    if schemaRead == "" {
        return false, schemaGen, nil
    }

    // Unmarshal the schema strings into AvroSchema structs
    if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
        return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
    }
    if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
        return false, "", fmt.Errorf("failed to parse read schema: %v", err)
    }

    sort.Slice(genSchema.Fields, func(i, j int) bool {
        return genSchema.Fields[i].Name < genSchema.Fields[j].Name
    })

    sort.Slice(readSchema.Fields, func(i, j int) bool {
        return readSchema.Fields[i].Name < readSchema.Fields[j].Name
    })

    // Check if schemas are identical
    schemasEqual := true
    if len(genSchema.Fields) <= len(readSchema.Fields) {

        for i := range genSchema.Fields {
            if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
                schemasEqual = false
                break
            }
        }

        // If schemas are identical, return the read schema
        if schemasEqual {
            return false, schemaRead, nil
        }
    }

    // Create a map to hold unique fields from both schemas
    fieldMap := make(map[string]AvroField)

    // Add fields from the read schema
    for _, field := range readSchema.Fields {
        fieldMap[field.Name] = field
    }

    // Add or update fields from the generated schema
    for _, field := range genSchema.Fields {
        fieldMap[field.Name] = field
    }

    // Create a union schema by collecting fields from the map
    var mergedFields []AvroField
    for _, field := range fieldMap {
        mergedFields = append(mergedFields, field)
    }

    // Sort fields by name for consistency
    sort.Slice(mergedFields, func(i, j int) bool {
        return mergedFields[i].Name < mergedFields[j].Name
    })

    // Create the merged schema
    mergedSchema := AvroSchema{
        Type:   "record",
        Name:   genSchema.Name,
        Fields: mergedFields,
    }

    // Check if schemas are identical
    schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
    if schemasEqual {
        for i := range mergedSchema.Fields {
            if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
                schemasEqual = false
                break
            }
        }

        if schemasEqual {
            return false, schemaRead, nil
        }
    }

    // Marshal the merged schema back to JSON
    mergedSchemaJSON, err := json.Marshal(mergedSchema)
    if err != nil {
        return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
    }

    return true, string(mergedSchemaJSON), nil
}

func generateSchema(data map[string]schema.Float) (string, error) {
    // Define the Avro schema structure
    schema := map[string]any{
        "type":   "record",
        "name":   "DataRecord",
        "fields": []map[string]any{},
    }

    fieldTracker := make(map[string]struct{})

    for key := range data {
        if _, exists := fieldTracker[key]; !exists {
            key = correctKey(key)

            field := map[string]any{
                "name":    key,
                "type":    "double",
                "default": -1.0,
            }
            schema["fields"] = append(schema["fields"].([]map[string]any), field)
            fieldTracker[key] = struct{}{}
        }
    }

    schemaString, err := json.Marshal(schema)
    if err != nil {
        return "", fmt.Errorf("failed to marshal schema: %v", err)
    }

    return string(schemaString), nil
}
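For a data map with the keys "cpu_load" and "mem_used" (neither contains ':' or '.', so correctKey leaves them unchanged), the marshaled schema would look roughly like the following; note that encoding/json sorts map keys alphabetically, while the order inside the "fields" array follows Go's map iteration order and is therefore not stable:

{"fields":[{"default":-1,"name":"cpu_load","type":"double"},{"default":-1,"name":"mem_used","type":"double"}],"name":"DataRecord","type":"record"}
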

func generateRecord(data map[string]schema.Float) map[string]any {
    record := make(map[string]any)

    // Iterate through each map in data
    for key, value := range data {
        key = correctKey(key)

        // Set the value in the record
        // avro only accepts basic types
        record[key] = value.Double()
    }

    return record
}

func correctKey(key string) string {
    // Avro field names may not contain ':' or '.',
    // so escape them with underscore sequences.
    key = strings.ReplaceAll(key, ":", "___")
    key = strings.ReplaceAll(key, ".", "__")

    return key
}

func ReplaceKey(key string) string {
    // Undo the escaping done by correctKey.
    key = strings.ReplaceAll(key, "___", ":")
    key = strings.ReplaceAll(key, "__", ".")

    return key
}
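A small round-trip sketch of the escaping pair above; the round trip is unambiguous as long as original keys do not themselves contain consecutive underscores:

package main

import (
    "fmt"
    "strings"
)

func main() {
    key := "testcluster:node001:cpu.load"
    // correctKey: ':' -> "___", '.' -> "__"
    escaped := strings.ReplaceAll(strings.ReplaceAll(key, ":", "___"), ".", "__")
    fmt.Println(escaped) // testcluster___node001___cpu__load
    // ReplaceKey: "___" -> ':' first, then "__" -> '.'
    restored := strings.ReplaceAll(strings.ReplaceAll(escaped, "___", ":"), "__", ".")
    fmt.Println(restored) // testcluster:node001:cpu.load
}
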
@@ -1,84 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package memorystore

import (
    "context"
    "slices"
    "strconv"
    "sync"

    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)

func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
    // AvroPool is a pool of Avro writers.
    go func() {
        if Keys.Checkpoints.FileFormat == "json" {
            wg.Done() // Mark this goroutine as done
            return    // Exit the goroutine
        }

        defer wg.Done()

        var avroLevel *AvroLevel
        oldSelector := make([]string, 0)

        for {
            select {
            case <-ctx.Done():
                return
            case val := <-LineProtocolMessages:
                // Fetch the frequency of the metric from the global configuration
                freq, err := GetMetricFrequency(val.MetricName)
                if err != nil {
                    cclog.Errorf("Error fetching metric frequency: %s\n", err)
                    continue
                }

                metricName := ""

                for _, selectorName := range val.Selector {
                    metricName += selectorName + Delimiter
                }

                metricName += val.MetricName

                // Create a new selector for the Avro level.
                // The selector is a slice of strings that represents the path to the
                // Avro level. It is created by appending the cluster, node, and metric
                // name to the selector.
                var selector []string
                selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))

                if !testEq(oldSelector, selector) {
                    // Get the Avro level for the metric
                    avroLevel = avroStore.root.findAvroLevelOrCreate(selector)

                    // If the Avro level is nil, create a new one
                    if avroLevel == nil {
                        cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
                    }
                    oldSelector = slices.Clone(selector)
                }

                avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
            }
        }
    }()
}

func testEq(a, b []string) bool {
    if len(a) != len(b) {
        return false
    }
    for i := range a {
        if a[i] != b[i] {
            return false
        }
    }
    return true
}
@@ -1,168 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package memorystore

import (
    "sync"

    "github.com/ClusterCockpit/cc-lib/schema"
)

var (
    LineProtocolMessages = make(chan *AvroStruct)
    Delimiter            = "ZZZZZ"
)

// CheckpointBufferMinutes should always be in minutes.
// It controls the amount of data to hold for a given amount of time.
var CheckpointBufferMinutes = 3

type AvroStruct struct {
    MetricName string
    Cluster    string
    Node       string
    Selector   []string
    Value      schema.Float
    Timestamp  int64
}

type AvroStore struct {
    root AvroLevel
}

var avroStore AvroStore

type AvroLevel struct {
    children map[string]*AvroLevel
    data     map[int64]map[string]schema.Float
    lock     sync.RWMutex
}

type AvroField struct {
    Name    string `json:"name"`
    Type    any    `json:"type"`
    Default any    `json:"default,omitempty"`
}

type AvroSchema struct {
    Type   string      `json:"type"`
    Name   string      `json:"name"`
    Fields []AvroField `json:"fields"`
}

func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
    if len(selector) == 0 {
        return l
    }

    // Allow concurrent reads:
    l.lock.RLock()
    var child *AvroLevel
    var ok bool
    if l.children == nil {
        // Children map needs to be created...
        l.lock.RUnlock()
    } else {
        child, ok := l.children[selector[0]]
        l.lock.RUnlock()
        if ok {
            return child.findAvroLevelOrCreate(selector[1:])
        }
    }

    // The level does not exist, take write lock for unique access:
    l.lock.Lock()
    // While this thread waited for the write lock, another thread
    // could have created the child node.
    if l.children != nil {
        child, ok = l.children[selector[0]]
        if ok {
            l.lock.Unlock()
            return child.findAvroLevelOrCreate(selector[1:])
        }
    }

    child = &AvroLevel{
        data:     make(map[int64]map[string]schema.Float, 0),
        children: nil,
    }

    if l.children != nil {
        l.children[selector[0]] = child
    } else {
        l.children = map[string]*AvroLevel{selector[0]: child}
    }
    l.lock.Unlock()
    return child.findAvroLevelOrCreate(selector[1:])
}

func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
    l.lock.Lock()
    defer l.lock.Unlock()

    KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)

    // Create keys in advance for the given amount of time
    if len(l.data) != KeyCounter {
        if len(l.data) == 0 {
            for i := range KeyCounter {
                l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
            }
        } else {
            // Get the last timestamp
            var lastTS int64
            for ts := range l.data {
                if ts > lastTS {
                    lastTS = ts
                }
            }
            // Create keys for the next KeyCounter timestamps
            l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
        }
    }

    closestTS := int64(0)
    minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
    found := false

    // Iterate over timestamps and choose the one which is within range.
    // Since it's epoch time, we check whether the difference is less than 60 seconds.
    for ts, dat := range l.data {
        // Check if timestamp is within range
        diff := timestamp - ts
        if diff < -int64(Freq) || diff > int64(Freq) {
            continue
        }

        // Metric already present at this timestamp — skip
        if _, ok := dat[metricName]; ok {
            continue
        }

        // Check if this is the closest timestamp so far
        if Abs(diff) < minDiff {
            minDiff = Abs(diff)
            closestTS = ts
            found = true
        }
    }

    if found {
        l.data[closestTS][metricName] = value
    }
}
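A worked example of the slot selection in addMetric, under assumed values Freq = 60 and three pre-created timestamp keys: an incoming sample is matched to the nearest key within plus or minus one interval.

package main

import "fmt"

func main() {
    // Assumed: Freq = 60s, CheckpointBufferMinutes = 3 -> 3 pre-created slots.
    freq := int64(60)
    slots := []int64{1000, 1060, 1120} // pre-created timestamp keys
    sample := int64(1055)              // incoming measurement

    best, minDiff, found := int64(0), freq+1, false
    for _, ts := range slots {
        diff := sample - ts
        if diff < -freq || diff > freq {
            continue // outside the +/- one-interval window
        }
        if abs(diff) < minDiff {
            minDiff, best, found = abs(diff), ts, true
        }
    }
    fmt.Println(best, found) // 1060 true: 1055 is closest to the 1060 slot
}

func abs(x int64) int64 {
    if x < 0 {
        return -x
    }
    return x
}
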

func GetAvroStore() *AvroStore {
    return &avroStore
}

// Abs returns the absolute value of x.
func Abs(x int64) int64 {
    if x < 0 {
        return -x
    }
    return x
}
@@ -1,198 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package memorystore

import (
    "errors"
    "sync"

    "github.com/ClusterCockpit/cc-lib/schema"
)

// Default buffer capacity.
// `buffer.data` will only ever grow up to its capacity, and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const (
    BufferCap int = 512
)

// So that we can reuse allocations
var bufferPool sync.Pool = sync.Pool{
    New: func() any {
        return &buffer{
            data: make([]schema.Float, 0, BufferCap),
        }
    },
}

var (
    ErrNoData           error = errors.New("[METRICSTORE]> no data for this metric/level")
    ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)

// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
    prev      *buffer
    next      *buffer
    data      []schema.Float
    frequency int64
    start     int64
    archived  bool
    closed    bool
}

func newBuffer(ts, freq int64) *buffer {
    b := bufferPool.Get().(*buffer)
    b.frequency = freq
    b.start = ts - (freq / 2)
    b.prev = nil
    b.next = nil
    b.archived = false
    b.closed = false
    b.data = b.data[:0]
    return b
}

// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyway it is allowed.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
    if ts < b.start {
        return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
    }

    // idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
    idx := int((ts - b.start) / b.frequency)
    if idx >= cap(b.data) {
        newbuf := newBuffer(ts, b.frequency)
        newbuf.prev = b
        b.next = newbuf
        b.close()
        b = newbuf
        idx = 0
    }

    // Overwriting value or writing value from past
    if idx < len(b.data) {
        b.data[idx] = value
        return b, nil
    }

    // Fill up unwritten slots with NaN
    for i := len(b.data); i < idx; i++ {
        b.data = append(b.data, schema.NaN)
    }

    b.data = append(b.data, value)
    return b, nil
}
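A short sketch of the index arithmetic in newBuffer/write: because start is offset by half a frequency, every sample within half an interval of a slot's nominal timestamp maps to that slot.

package main

import "fmt"

func main() {
    // Mirrors newBuffer/write above: start = firstTS - freq/2, so at freq=60
    // a sample anywhere in [nominal-30, nominal+29] maps to the same slot.
    freq := int64(60)
    firstTS := int64(1000)
    start := firstTS - freq/2 // 970

    for _, ts := range []int64{1000, 1029, 1031, 1060} {
        idx := (ts - start) / freq
        fmt.Println(ts, "-> slot", idx) // 1000->0, 1029->0, 1031->1, 1060->1
    }
}
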

func (b *buffer) end() int64 {
    return b.firstWrite() + int64(len(b.data))*b.frequency
}

func (b *buffer) firstWrite() int64 {
    return b.start + (b.frequency / 2)
}

func (b *buffer) close() {}

// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function goes back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
    if from < b.firstWrite() {
        if b.prev != nil {
            return b.prev.read(from, to, data)
        }
        from = b.firstWrite()
    }

    i := 0
    t := from
    for ; t < to; t += b.frequency {
        idx := int((t - b.start) / b.frequency)
        if idx >= cap(b.data) {
            if b.next == nil {
                break
            }
            b = b.next
            idx = 0
        }

        if idx >= len(b.data) {
            if b.next == nil || to <= b.next.start {
                break
            }
            data[i] += schema.NaN
        } else if t < b.start {
            data[i] += schema.NaN
            // } else if b.data[idx].IsNaN() {
            // 	data[i] += interpolate(idx, b.data)
        } else {
            data[i] += b.data[idx]
        }
        i++
    }

    return data[:i], from, t, nil
}

// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
    if b.prev != nil {
        delme, m := b.prev.free(t)
        n += m
        if delme {
            b.prev.next = nil
            if cap(b.prev.data) == BufferCap {
                bufferPool.Put(b.prev)
            }
            b.prev = nil
        }
    }

    end := b.end()
    if end < t {
        return true, n + 1
    }

    return false, n
}

// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
    if b == nil {
        return nil
    }

    if err := b.prev.iterFromTo(from, to, callback); err != nil {
        return err
    }

    if from <= b.end() && b.start <= to {
        return callback(b)
    }

    return nil
}

func (b *buffer) count() int64 {
    res := int64(len(b.data))
    if b.prev != nil {
        res += b.prev.count()
    }
    return res
}
@@ -1,783 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package memorystore

import (
    "bufio"
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "io/fs"
    "os"
    "path"
    "path/filepath"
    "runtime"
    "sort"
    "strconv"
    "strings"
    "sync"
    "sync/atomic"
    "time"

    cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
    "github.com/ClusterCockpit/cc-lib/schema"
    "github.com/linkedin/goavro/v2"
)

// File operation constants
const (
    // CheckpointFilePerms defines default permissions for checkpoint files
    CheckpointFilePerms = 0o644
    // CheckpointDirPerms defines default permissions for checkpoint directories
    CheckpointDirPerms = 0o755
    // GCTriggerInterval determines how often GC is forced during checkpoint loading
    // GC is triggered every GCTriggerInterval*NumWorkers loaded hosts
    GCTriggerInterval = 100
)

// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
    Data      []schema.Float `json:"data"`
    Frequency int64          `json:"frequency"`
    Start     int64          `json:"start"`
}

type CheckpointFile struct {
    Metrics  map[string]*CheckpointMetrics `json:"metrics"`
    Children map[string]*CheckpointFile    `json:"children"`
    From     int64                         `json:"from"`
    To       int64                         `json:"to"`
}

var lastCheckpoint time.Time

func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
    lastCheckpoint = time.Now()

    if Keys.Checkpoints.FileFormat == "json" {
        ms := GetMemoryStore()

        go func() {
            defer wg.Done()
            d, err := time.ParseDuration(Keys.Checkpoints.Interval)
            if err != nil {
                cclog.Fatal(err)
            }
            if d <= 0 {
                return
            }

            ticks := func() <-chan time.Time {
                if d <= 0 {
                    return nil
                }
                return time.NewTicker(d).C
            }()
            for {
                select {
                case <-ctx.Done():
                    return
                case <-ticks:
                    cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
                    now := time.Now()
                    n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
                        lastCheckpoint.Unix(), now.Unix())
                    if err != nil {
                        cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
                    } else {
                        cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
                        lastCheckpoint = now
                    }
                }
            }
        }()
    } else {
        go func() {
            defer wg.Done()
            d, _ := time.ParseDuration("1m")

            select {
            case <-ctx.Done():
                return
            case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
                // This is the first tick: wait until data has been collected
                // for the configured number of buffer minutes.
                GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
                // log.Printf("Checkpointing %d avro files", count)
            }

            ticks := func() <-chan time.Time {
                if d <= 0 {
                    return nil
                }
                return time.NewTicker(d).C
            }()

            for {
                select {
                case <-ctx.Done():
                    return
                case <-ticks:
                    // Regular ticks of 1 minute to write data.
                    GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
                    // log.Printf("Checkpointing %d avro files", count)
                }
            }
        }()
    }
}

// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
    buf := make([]byte, 0, 128+len(cm.Data)*8)
    buf = append(buf, `{"frequency":`...)
    buf = strconv.AppendInt(buf, cm.Frequency, 10)
    buf = append(buf, `,"start":`...)
    buf = strconv.AppendInt(buf, cm.Start, 10)
    buf = append(buf, `,"data":[`...)
    for i, x := range cm.Data {
        if i != 0 {
            buf = append(buf, ',')
        }
        if x.IsNaN() {
            buf = append(buf, `null`...)
        } else {
            buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
        }
    }
    buf = append(buf, `]}`...)
    return buf, nil
}
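Given the encoder above, a CheckpointMetrics with Frequency 60, Start 1700000000 and Data [1.5, NaN, 2.0] would serialize roughly as follows; NaN gaps become null and floats are printed with one decimal digit:

{"frequency":60,"start":1700000000,"data":[1.5,null,2.0]}
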

// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only a host at a time is locked, so this function can run
// in parallel to writes/reads.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
    levels := make([]*Level, 0)
    selectors := make([][]string, 0)
    m.root.lock.RLock()
    for sel1, l1 := range m.root.children {
        l1.lock.RLock()
        for sel2, l2 := range l1.children {
            levels = append(levels, l2)
            selectors = append(selectors, []string{sel1, sel2})
        }
        l1.lock.RUnlock()
    }
    m.root.lock.RUnlock()

    type workItem struct {
        level    *Level
        dir      string
        selector []string
    }

    n, errs := int32(0), int32(0)

    var wg sync.WaitGroup
    wg.Add(Keys.NumWorkers)
    work := make(chan workItem, Keys.NumWorkers*2)
    for worker := 0; worker < Keys.NumWorkers; worker++ {
        go func() {
            defer wg.Done()

            for workItem := range work {
                if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
                    if err == ErrNoNewArchiveData {
                        continue
                    }

                    cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
                    atomic.AddInt32(&errs, 1)
                } else {
                    atomic.AddInt32(&n, 1)
                }
            }
        }()
    }

    for i := 0; i < len(levels); i++ {
        dir := path.Join(dir, path.Join(selectors[i]...))
        work <- workItem{
            level:    levels[i],
            dir:      dir,
            selector: selectors[i],
        }
    }

    close(work)
    wg.Wait()

    if errs > 0 {
        return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
    }
    return int(n), nil
}

func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
    l.lock.RLock()
    defer l.lock.RUnlock()

    retval := &CheckpointFile{
        From:     from,
        To:       to,
        Metrics:  make(map[string]*CheckpointMetrics),
        Children: make(map[string]*CheckpointFile),
    }

    for metric, minfo := range m.Metrics {
        b := l.metrics[minfo.offset]
        if b == nil {
            continue
        }

        allArchived := true
        b.iterFromTo(from, to, func(b *buffer) error {
            if !b.archived {
                allArchived = false
            }
            return nil
        })

        if allArchived {
            continue
        }

        data := make([]schema.Float, (to-from)/b.frequency+1)
        data, start, end, err := b.read(from, to, data)
        if err != nil {
            return nil, err
        }

        for i := int((end - start) / b.frequency); i < len(data); i++ {
            data[i] = schema.NaN
        }

        retval.Metrics[metric] = &CheckpointMetrics{
            Frequency: b.frequency,
            Start:     start,
            Data:      data,
        }
    }

    for name, child := range l.children {
        val, err := child.toCheckpointFile(from, to, m)
        if err != nil {
            return nil, err
        }

        if val != nil {
            retval.Children[name] = val
        }
    }

    if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
        return nil, nil
    }

    return retval, nil
}

func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
    cf, err := l.toCheckpointFile(from, to, m)
    if err != nil {
        return err
    }

    if cf == nil {
        return ErrNoNewArchiveData
    }

    filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
    f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
    if err != nil && os.IsNotExist(err) {
        err = os.MkdirAll(dir, CheckpointDirPerms)
        if err == nil {
            f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
        }
    }
    if err != nil {
        return err
    }
    defer f.Close()

    bw := bufio.NewWriter(f)
    if err = json.NewEncoder(bw).Encode(cf); err != nil {
        return err
    }

    return bw.Flush()
}

func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
    var wg sync.WaitGroup
    work := make(chan [2]string, Keys.NumWorkers)
    n, errs := int32(0), int32(0)

    wg.Add(Keys.NumWorkers)
    for worker := 0; worker < Keys.NumWorkers; worker++ {
        go func() {
            defer wg.Done()
            for host := range work {
                lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
                nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
                if err != nil {
                    cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
                    atomic.AddInt32(&errs, 1)
                }
                atomic.AddInt32(&n, int32(nn))
            }
        }()
    }

    i := 0
    clustersDir, err := os.ReadDir(dir)
    for _, clusterDir := range clustersDir {
        if !clusterDir.IsDir() {
            err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
            goto done
        }

        hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
        if e != nil {
            err = e
            goto done
        }

        for _, hostDir := range hostsDir {
            if !hostDir.IsDir() {
                err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
                goto done
            }

            i++
            if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
                // Forcing garbage collection runs here regularly during the loading of checkpoints
                // will decrease the total heap size after loading everything back to memory is done.
                // While loading data, the heap will grow fast, so the GC target size will double
                // almost always. By forcing GCs here, we can keep it growing more slowly so that
                // at the end, less memory is wasted.
                runtime.GC()
            }

            work <- [2]string{clusterDir.Name(), hostDir.Name()}
        }
    }
done:
    close(work)
    wg.Wait()

    if err != nil {
        return int(n), err
    }

    if errs > 0 {
        return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
    }
    return int(n), nil
}

// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
    if _, err := os.Stat(dir); os.IsNotExist(err) {
        // The directory does not exist, so create it using os.MkdirAll()
        err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory
        if err != nil {
            cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
        }
        cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
    }

    // Config read (replace with your actual config read)
    fileFormat := Keys.Checkpoints.FileFormat
    if fileFormat == "" {
        fileFormat = "avro"
    }

    // Map to easily get the fallback format
    oppositeFormat := map[string]string{
        "json": "avro",
        "avro": "json",
    }

    // First, attempt to load the specified format
    if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
        return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
    } else if found {
        cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
        return m.FromCheckpoint(dir, from, fileFormat)
    }

    // If not found, attempt the opposite format
    altFormat := oppositeFormat[fileFormat]
    if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
        return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
    } else if found {
        cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
        return m.FromCheckpoint(dir, from, altFormat)
    }

    cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
    return 0, nil
}

func checkFilesWithExtension(dir string, extension string) (bool, error) {
    found := false

    err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
        if err != nil {
            return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
        }
        if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
            found = true
            return nil
        }
        return nil
    })
    if err != nil {
        return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
    }

    return found, nil
}

func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
    br := bufio.NewReader(f)

    fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
    resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
    if err != nil {
        return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
    }

    fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)

    // Same logic according to lineprotocol
    fromTimestamp -= (resolution / 2)

    if err != nil {
        return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
    }

    // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)

    var recordCounter int64 = 0

    // Create a new OCF reader from the buffered reader
    ocfReader, err := goavro.NewOCFReader(br)
    if err != nil {
        return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
    }

    metricsData := make(map[string]schema.FloatArray)

    for ocfReader.Scan() {
        datum, err := ocfReader.Read()
        if err != nil {
            return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
        }

        record, ok := datum.(map[string]any)
        if !ok {
            return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
        }

        for key, value := range record {
            metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64)))
        }

        recordCounter += 1
    }

    to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
    if to < from {
        return nil
    }

    for key, floatArray := range metricsData {
        metricName := ReplaceKey(key)

        if strings.Contains(metricName, Delimiter) {
            subString := strings.Split(metricName, Delimiter)

            lvl := l

            for i := 0; i < len(subString)-1; i++ {

                sel := subString[i]

                if lvl.children == nil {
                    lvl.children = make(map[string]*Level)
                }

                child, ok := lvl.children[sel]
                if !ok {
                    child = &Level{
                        metrics:  make([]*buffer, len(m.Metrics)),
                        children: nil,
                    }
                    lvl.children[sel] = child
                }
                lvl = child
            }

            leafMetricName := subString[len(subString)-1]
            err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
            if err != nil {
                return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
            }
        } else {
            err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
            if err != nil {
                return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
            }
        }
    }

    return nil
}

func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
    n := len(floatArray)
    b := &buffer{
        frequency: resolution,
        start:     from,
        data:      floatArray[0:n:n],
        prev:      nil,
        next:      nil,
        archived:  true,
    }
    b.close()

    minfo, ok := m.Metrics[metricName]
    if !ok {
        return nil
        // return errors.New("Unknown metric: " + name)
    }

    prev := l.metrics[minfo.offset]
    if prev == nil {
        l.metrics[minfo.offset] = b
    } else {
        if prev.start > b.start {
            return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
        }

        b.prev = prev
        prev.next = b

        missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
        if missingCount > 0 {
            missingCount /= int(b.frequency)

            for range missingCount {
                prev.data = append(prev.data, schema.NaN)
            }

            prev.data = prev.data[0:len(prev.data):len(prev.data)]
        }
    }
    l.metrics[minfo.offset] = b

    return nil
}

func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
    br := bufio.NewReader(f)
    cf := &CheckpointFile{}
    if err := json.NewDecoder(br).Decode(cf); err != nil {
        return err
    }

    if cf.To != 0 && cf.To < from {
        return nil
    }

    if err := l.loadFile(cf, m); err != nil {
        return err
    }

    return nil
}

func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
    for name, metric := range cf.Metrics {
        n := len(metric.Data)
        b := &buffer{
            frequency: metric.Frequency,
            start:     metric.Start,
            data:      metric.Data[0:n:n], // Space is wasted here :(
            prev:      nil,
            next:      nil,
            archived:  true,
        }
        b.close()

        minfo, ok := m.Metrics[name]
        if !ok {
            continue
            // return errors.New("Unknown metric: " + name)
        }

        prev := l.metrics[minfo.offset]
        if prev == nil {
            l.metrics[minfo.offset] = b
        } else {
            if prev.start > b.start {
                return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
            }

            b.prev = prev
            prev.next = b
        }
        l.metrics[minfo.offset] = b
    }

    if len(cf.Children) > 0 && l.children == nil {
        l.children = make(map[string]*Level)
    }

    for sel, childCf := range cf.Children {
        child, ok := l.children[sel]
        if !ok {
            child = &Level{
                metrics:  make([]*buffer, len(m.Metrics)),
                children: nil,
            }
            l.children[sel] = child
        }

        if err := child.loadFile(childCf, m); err != nil {
            return err
        }
    }

    return nil
}

func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
    direntries, err := os.ReadDir(dir)
    if err != nil {
        if os.IsNotExist(err) {
            return 0, nil
        }

        return 0, err
    }

    allFiles := make([]fs.DirEntry, 0)
    filesLoaded := 0
    for _, e := range direntries {
        if e.IsDir() {
            child := &Level{
                metrics:  make([]*buffer, len(m.Metrics)),
                children: make(map[string]*Level),
            }

            files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
            filesLoaded += files
            if err != nil {
                return filesLoaded, err
            }

            l.children[e.Name()] = child
        } else if strings.HasSuffix(e.Name(), "."+extension) {
            allFiles = append(allFiles, e)
        } else {
            continue
        }
    }

    files, err := findFiles(allFiles, from, extension, true)
    if err != nil {
        return filesLoaded, err
    }

    loaders := map[string]func(*MemoryStore, *os.File, int64) error{
        "json": l.loadJSONFile,
        "avro": l.loadAvroFile,
    }

    loader := loaders[extension]

    for _, filename := range files {
        // Use a closure to ensure file is closed immediately after use
        err := func() error {
            f, err := os.Open(path.Join(dir, filename))
            if err != nil {
                return err
            }
            defer f.Close()

            return loader(m, f, from)
        }()
        if err != nil {
            return filesLoaded, err
        }

        filesLoaded += 1
    }

    return filesLoaded, nil
}
|
||||
// This will probably get very slow over time!
|
||||
// A solution could be some sort of an index file in which all other files
|
||||
// and the timespan they contain is listed.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
|
||||
nums := map[string]int64{}
|
||||
for _, e := range direntries {
|
||||
if !strings.HasSuffix(e.Name(), "."+extension) {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nums[e.Name()] = ts
|
||||
}
|
||||
|
||||
sort.Slice(direntries, func(i, j int) bool {
|
||||
a, b := direntries[i], direntries[j]
|
||||
return nums[a.Name()] < nums[b.Name()]
|
||||
})
|
||||
|
||||
filenames := make([]string, 0)
|
||||
for i := range direntries {
|
||||
e := direntries[i]
|
||||
ts1 := nums[e.Name()]
|
||||
|
||||
if findMoreRecentFiles && t <= ts1 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
if i == len(direntries)-1 {
|
||||
continue
|
||||
}
|
||||
|
||||
enext := direntries[i+1]
|
||||
ts2 := nums[enext.Name()]
|
||||
|
||||
if findMoreRecentFiles {
|
||||
if ts1 < t && t < ts2 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
} else {
|
||||
if ts2 < t {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return filenames, nil
|
||||
}
|
||||
@@ -1,121 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
var InternalCCMSFlag bool = false
|
||||
|
||||
type MetricStoreConfig struct {
|
||||
// Number of concurrent workers for checkpoint and archive operations.
|
||||
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
|
||||
NumWorkers int `json:"num-workers"`
|
||||
Checkpoints struct {
|
||||
FileFormat string `json:"file-format"`
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
Restore string `json:"restore"`
|
||||
} `json:"checkpoints"`
|
||||
Debug struct {
|
||||
DumpToFile string `json:"dump-to-file"`
|
||||
EnableGops bool `json:"gops"`
|
||||
} `json:"debug"`
|
||||
RetentionInMemory string `json:"retention-in-memory"`
|
||||
Archive struct {
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
DeleteInstead bool `json:"delete-instead"`
|
||||
} `json:"archive"`
|
||||
Nats []*NatsConfig `json:"nats"`
|
||||
}
|
||||
|
||||
type NatsConfig struct {
|
||||
// Address of the nats server
|
||||
Address string `json:"address"`
|
||||
|
||||
// Username/Password, optional
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
|
||||
// Creds file path
|
||||
Credsfilepath string `json:"creds-file-path"`
|
||||
|
||||
Subscriptions []struct {
|
||||
// Channel name
|
||||
SubscribeTo string `json:"subscribe-to"`
|
||||
|
||||
// Allow lines without a cluster tag, use this as default, optional
|
||||
ClusterTag string `json:"cluster-tag"`
|
||||
} `json:"subscriptions"`
|
||||
}
|
||||
|
||||
var Keys MetricStoreConfig
|
||||
|
||||
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
|
||||
type AggregationStrategy int
|
||||
|
||||
const (
|
||||
NoAggregation AggregationStrategy = iota
|
||||
SumAggregation
|
||||
AvgAggregation
|
||||
)
|
||||
|
||||
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
|
||||
switch str {
|
||||
case "":
|
||||
return NoAggregation, nil
|
||||
case "sum":
|
||||
return SumAggregation, nil
|
||||
case "avg":
|
||||
return AvgAggregation, nil
|
||||
default:
|
||||
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
|
||||
}
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
// Interval in seconds at which measurements are stored
|
||||
Frequency int64
|
||||
|
||||
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
|
||||
Aggregation AggregationStrategy
|
||||
|
||||
// Private, used internally...
|
||||
offset int
|
||||
}
|
||||
|
||||
var Metrics map[string]MetricConfig
|
||||
|
||||
func GetMetricFrequency(metricName string) (int64, error) {
|
||||
if metric, ok := Metrics[metricName]; ok {
|
||||
return metric.Frequency, nil
|
||||
}
|
||||
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
|
||||
}
|
||||
|
||||
// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency.
|
||||
// use metric.Name to check if the metric already exists.
|
||||
// if not, add it to the Metrics map.
|
||||
func AddMetric(name string, metric MetricConfig) error {
|
||||
if Metrics == nil {
|
||||
Metrics = make(map[string]MetricConfig, 0)
|
||||
}
|
||||
|
||||
if existingMetric, ok := Metrics[name]; ok {
|
||||
if existingMetric.Frequency != metric.Frequency {
|
||||
if existingMetric.Frequency < metric.Frequency {
|
||||
existingMetric.Frequency = metric.Frequency
|
||||
Metrics[name] = existingMetric
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Metrics[name] = metric
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,95 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
const configSchema = `{
|
||||
"type": "object",
|
||||
"description": "Configuration specific to built-in metric-store.",
|
||||
"properties": {
|
||||
"checkpoints": {
|
||||
"description": "Configuration for checkpointing the metrics within metric-store",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file-format": {
|
||||
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
|
||||
"type": "string"
|
||||
},
|
||||
"interval": {
|
||||
"description": "Interval at which the metrics should be checkpointed.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the checkpointed files should be placed.",
|
||||
"type": "string"
|
||||
},
|
||||
"restore": {
|
||||
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"archive": {
|
||||
"description": "Configuration for archiving the already checkpointed files.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"interval": {
|
||||
"description": "Interval at which the checkpointed files should be archived.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the archived files should be placed.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"retention-in-memory": {
|
||||
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
|
||||
"type": "string"
|
||||
},
|
||||
"nats": {
|
||||
"description": "Configuration for accepting published data through NATS.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"address": {
|
||||
"description": "Address of the NATS server.",
|
||||
"type": "string"
|
||||
},
|
||||
"username": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"password": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"creds-file-path": {
|
||||
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
|
||||
"type": "string"
|
||||
},
|
||||
"subscriptions": {
|
||||
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"subscribe-to": {
|
||||
"description": "Channel name",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster-tag": {
|
||||
"description": "Optional: Allow lines without a cluster tag, use this as default",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
@@ -1,92 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
|
||||
// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
|
||||
// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
|
||||
const MaxMissingDataPoints int64 = 5
|
||||
|
||||
// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
|
||||
// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
|
||||
// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
|
||||
// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
|
||||
const MaxUnhealthyMetrics int64 = 5
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
}
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
count += c
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
@@ -1,192 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
// Could also be called "node" as this forms a node in a tree structure.
|
||||
// Called Level because "node" might be confusing here.
|
||||
// Can be both a leaf or a inner node. In this tree structue, inner nodes can
|
||||
// also hold data (in `metrics`).
|
||||
type Level struct {
|
||||
children map[string]*Level
|
||||
metrics []*buffer
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
// Find the correct level for the given selector, creating it if
|
||||
// it does not exist. Example selector in the context of the
|
||||
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
|
||||
// This function would probably benefit a lot from `level.children` beeing a `sync.Map`?
|
||||
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *Level
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok = l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unqiue access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, nMetrics),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*Level{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
|
||||
func (l *Level) free(t int64) (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
n := 0
|
||||
for i, b := range l.metrics {
|
||||
if b != nil {
|
||||
delme, m := b.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
if cap(b.data) == BufferCap {
|
||||
bufferPool.Put(b)
|
||||
}
|
||||
l.metrics[i] = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range l.children {
|
||||
m, err := l.free(t)
|
||||
n += m
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (l *Level) sizeInBytes() int64 {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
size := int64(0)
|
||||
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
|
||||
}
|
||||
}
|
||||
|
||||
for _, child := range l.children {
|
||||
size += child.sizeInBytes()
|
||||
}
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
func (l *Level) findLevel(selector []string) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
lvl := l.children[selector[0]]
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return lvl.findLevel(selector[1:])
|
||||
}
|
||||
|
||||
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
if len(selector) == 0 {
|
||||
b := l.metrics[offset]
|
||||
if b != nil {
|
||||
return f(b)
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
err := lvl.findBuffers(nil, offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
sel := selector[0]
|
||||
if len(sel.String) != 0 && l.children != nil {
|
||||
lvl, ok := l.children[sel.String]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Group != nil && l.children != nil {
|
||||
for _, key := range sel.Group {
|
||||
lvl, ok := l.children[key]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Any && l.children != nil {
|
||||
for _, lvl := range l.children {
|
||||
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,351 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// Each connection is handled in it's own goroutine. This is a blocking function.
|
||||
// func ReceiveRaw(ctx context.Context,
|
||||
// listener net.Listener,
|
||||
// handleLine func(*lineprotocol.Decoder, string) error,
|
||||
// ) error {
|
||||
// var wg sync.WaitGroup
|
||||
|
||||
// wg.Add(1)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// <-ctx.Done()
|
||||
// if err := listener.Close(); err != nil {
|
||||
// log.Printf("listener.Close(): %s", err.Error())
|
||||
// }
|
||||
// }()
|
||||
|
||||
// for {
|
||||
// conn, err := listener.Accept()
|
||||
// if err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// break
|
||||
// }
|
||||
|
||||
// log.Printf("listener.Accept(): %s", err.Error())
|
||||
// }
|
||||
|
||||
// wg.Add(2)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// defer conn.Close()
|
||||
|
||||
// dec := lineprotocol.NewDecoder(conn)
|
||||
// connctx, cancel := context.WithCancel(context.Background())
|
||||
// defer cancel()
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// select {
|
||||
// case <-connctx.Done():
|
||||
// conn.Close()
|
||||
// case <-ctx.Done():
|
||||
// conn.Close()
|
||||
// }
|
||||
// }()
|
||||
|
||||
// if err := handleLine(dec, "default"); err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// return
|
||||
// }
|
||||
|
||||
// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
|
||||
// errmsg := make([]byte, 128)
|
||||
// errmsg = append(errmsg, `error: `...)
|
||||
// errmsg = append(errmsg, err.Error()...)
|
||||
// errmsg = append(errmsg, '\n')
|
||||
// conn.Write(errmsg)
|
||||
// }
|
||||
// }()
|
||||
// }
|
||||
|
||||
// wg.Wait()
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// ReceiveNats connects to a nats server and subscribes to "updates". This is a
|
||||
// blocking function. handleLine will be called for each line recieved via
|
||||
// nats. Send `true` through the done channel for gracefull termination.
|
||||
func ReceiveNats(conf *(NatsConfig),
|
||||
ms *MemoryStore,
|
||||
workers int,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
var opts []nats.Option
|
||||
if conf.Username != "" && conf.Password != "" {
|
||||
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
|
||||
}
|
||||
|
||||
if conf.Credsfilepath != "" {
|
||||
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
|
||||
}
|
||||
|
||||
nc, err := nats.Connect(conf.Address, opts...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nc.Close()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
var subs []*nats.Subscription
|
||||
|
||||
msgs := make(chan *nats.Msg, workers*2)
|
||||
|
||||
for _, sc := range conf.Subscriptions {
|
||||
clusterTag := sc.ClusterTag
|
||||
var sub *nats.Subscription
|
||||
if workers > 1 {
|
||||
wg.Add(workers)
|
||||
|
||||
for range workers {
|
||||
go func() {
|
||||
for m := range msgs {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
msgs <- m
|
||||
})
|
||||
} else {
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cclog.Infof("NATS subscription to '%s' on '%s' established", sc.SubscribeTo, conf.Address)
|
||||
subs = append(subs, sub)
|
||||
}
|
||||
|
||||
<-ctx.Done()
|
||||
for _, sub := range subs {
|
||||
err = sub.Unsubscribe()
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS unsubscribe failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
close(msgs)
|
||||
wg.Wait()
|
||||
|
||||
nc.Close()
|
||||
cclog.Print("NATS connection closed")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Place `prefix` in front of `buf` but if possible,
|
||||
// do that inplace in `buf`.
|
||||
func reorder(buf, prefix []byte) []byte {
|
||||
n := len(prefix)
|
||||
m := len(buf)
|
||||
if cap(buf) < m+n {
|
||||
return append(prefix[:n:n], buf...)
|
||||
} else {
|
||||
buf = buf[:n+m]
|
||||
for i := m - 1; i >= 0; i-- {
|
||||
buf[i+n] = buf[i]
|
||||
}
|
||||
for i := range n {
|
||||
buf[i] = prefix[i]
|
||||
}
|
||||
return buf
|
||||
}
|
||||
}
|
||||
|
||||
// Decode lines using dec and make write calls to the MemoryStore.
|
||||
// If a line is missing its cluster tag, use clusterDefault as default.
|
||||
func DecodeLine(dec *lineprotocol.Decoder,
|
||||
ms *MemoryStore,
|
||||
clusterDefault string,
|
||||
) error {
|
||||
// Reduce allocations in loop:
|
||||
t := time.Now()
|
||||
metric, metricBuf := Metric{}, make([]byte, 0, 16)
|
||||
selector := make([]string, 0, 4)
|
||||
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
|
||||
|
||||
// Optimize for the case where all lines in a "batch" are about the same
|
||||
// cluster and host. By using `WriteToLevel` (level = host), we do not need
|
||||
// to take the root- and cluster-level lock as often.
|
||||
var lvl *Level = nil
|
||||
prevCluster, prevHost := "", ""
|
||||
|
||||
var ok bool
|
||||
for dec.Next() {
|
||||
rawmeasurement, err := dec.Measurement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Needs to be copied because another call to dec.* would
|
||||
// invalidate the returned slice.
|
||||
metricBuf = append(metricBuf[:0], rawmeasurement...)
|
||||
|
||||
// The go compiler optimizes map[string(byteslice)] lookups:
|
||||
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
|
||||
cluster, host := clusterDefault, ""
|
||||
for {
|
||||
key, val, err := dec.NextTag()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
// The go compiler optimizes string([]byte{...}) == "...":
|
||||
switch string(key) {
|
||||
case "cluster":
|
||||
if string(val) == prevCluster {
|
||||
cluster = prevCluster
|
||||
} else {
|
||||
cluster = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "hostname", "host":
|
||||
if string(val) == prevHost {
|
||||
host = prevHost
|
||||
} else {
|
||||
host = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "type":
|
||||
if string(val) == "node" {
|
||||
break
|
||||
}
|
||||
|
||||
// We cannot be sure that the "type" tag comes before the "type-id" tag:
|
||||
if len(typeBuf) == 0 {
|
||||
typeBuf = append(typeBuf, val...)
|
||||
} else {
|
||||
typeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "type-id":
|
||||
typeBuf = append(typeBuf, val...)
|
||||
case "subtype":
|
||||
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
|
||||
if len(subTypeBuf) == 0 {
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
} else {
|
||||
subTypeBuf = reorder(subTypeBuf, val)
|
||||
// subTypeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "stype-id":
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
default:
|
||||
// Ignore unkown tags (cc-metric-collector might send us a unit for example that we do not need)
|
||||
// return fmt.Errorf("unkown tag: '%s' (value: '%s')", string(key), string(val))
|
||||
}
|
||||
}
|
||||
|
||||
// If the cluster or host changed, the lvl was set to nil
|
||||
if lvl == nil {
|
||||
selector = selector[:2]
|
||||
selector[0], selector[1] = cluster, host
|
||||
lvl = ms.GetLevel(selector)
|
||||
prevCluster, prevHost = cluster, host
|
||||
}
|
||||
|
||||
// subtypes:
|
||||
selector = selector[:0]
|
||||
if len(typeBuf) > 0 {
|
||||
selector = append(selector, string(typeBuf)) // <- Allocation :(
|
||||
if len(subTypeBuf) > 0 {
|
||||
selector = append(selector, string(subTypeBuf))
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
key, val, err := dec.NextField()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if string(key) != "value" {
|
||||
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
|
||||
}
|
||||
|
||||
if val.Kind() == lineprotocol.Float {
|
||||
metric.Value = schema.Float(val.FloatV())
|
||||
} else if val.Kind() == lineprotocol.Int {
|
||||
metric.Value = schema.Float(val.IntV())
|
||||
} else if val.Kind() == lineprotocol.Uint {
|
||||
metric.Value = schema.Float(val.UintV())
|
||||
} else {
|
||||
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
|
||||
}
|
||||
}
|
||||
|
||||
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
|
||||
time := t.Unix()
|
||||
|
||||
if Keys.Checkpoints.FileFormat != "json" {
|
||||
LineProtocolMessages <- &AvroStruct{
|
||||
MetricName: string(metricBuf),
|
||||
Cluster: cluster,
|
||||
Node: host,
|
||||
Selector: append([]string{}, selector...),
|
||||
Value: metric.Value,
|
||||
Timestamp: time,
|
||||
}
|
||||
}
|
||||
|
||||
if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,437 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package memorystore provides an efficient in-memory time-series metric storage system
|
||||
// with support for hierarchical data organization, checkpointing, and archiving.
|
||||
//
|
||||
// The package organizes metrics in a tree structure (cluster → host → component) and
|
||||
// provides concurrent read/write access to metric data with configurable aggregation strategies.
|
||||
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
|
||||
// and enforcing retention policies.
|
||||
//
|
||||
// Key features:
|
||||
// - In-memory metric storage with configurable retention
|
||||
// - Hierarchical data organization (selectors)
|
||||
// - Concurrent checkpoint/archive workers
|
||||
// - Support for sum and average aggregation
|
||||
// - NATS integration for metric ingestion
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
var (
|
||||
singleton sync.Once
|
||||
msInstance *MemoryStore
|
||||
// shutdownFunc stores the context cancellation function created in Init
|
||||
// and is called during Shutdown to cancel all background goroutines
|
||||
shutdownFunc context.CancelFunc
|
||||
)
|
||||
|
||||
|
||||
|
||||
type Metric struct {
|
||||
Name string
|
||||
Value schema.Float
|
||||
MetricConfig MetricConfig
|
||||
}
|
||||
|
||||
type MemoryStore struct {
|
||||
Metrics map[string]MetricConfig
|
||||
root Level
|
||||
}
|
||||
|
||||
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
|
||||
startupTime := time.Now()
|
||||
|
||||
if rawConfig != nil {
|
||||
config.Validate(configSchema, rawConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(rawConfig))
|
||||
// dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Set NumWorkers from config or use default
|
||||
if Keys.NumWorkers <= 0 {
|
||||
maxWorkers := 10
|
||||
Keys.NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers)
|
||||
}
|
||||
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
|
||||
|
||||
// Helper function to add metric configuration
|
||||
addMetricConfig := func(mc schema.MetricConfig) {
|
||||
agg, err := AssignAggregationStrategy(mc.Aggregation)
|
||||
if err != nil {
|
||||
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
|
||||
}
|
||||
|
||||
AddMetric(mc.Name, MetricConfig{
|
||||
Frequency: int64(mc.Timestep),
|
||||
Aggregation: agg,
|
||||
})
|
||||
}
|
||||
|
||||
for _, c := range archive.Clusters {
|
||||
for _, mc := range c.MetricConfig {
|
||||
addMetricConfig(*mc)
|
||||
}
|
||||
|
||||
for _, sc := range c.SubClusters {
|
||||
for _, mc := range sc.MetricConfig {
|
||||
addMetricConfig(mc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the config.MetricStoreKeys
|
||||
InitMetrics(Metrics)
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
restoreFrom := startupTime.Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
|
||||
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
|
||||
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
|
||||
}
|
||||
|
||||
// Try to use less memory by forcing a GC run here and then
|
||||
// lowering the target percentage. The default of 100 means
|
||||
// that only once the ratio of new allocations execeds the
|
||||
// previously active heap, a GC is triggered.
|
||||
// Forcing a GC here will set the "previously active heap"
|
||||
// to a minumum.
|
||||
runtime.GC()
|
||||
|
||||
ctx, shutdown := context.WithCancel(context.Background())
|
||||
|
||||
wg.Add(4)
|
||||
|
||||
Retention(wg, ctx)
|
||||
Checkpointing(wg, ctx)
|
||||
Archiving(wg, ctx)
|
||||
DataStaging(wg, ctx)
|
||||
|
||||
// Note: Signal handling has been removed from this function.
|
||||
// The caller is responsible for handling shutdown signals and calling
|
||||
// the shutdown() function when appropriate.
|
||||
// Store the shutdown function for later use by Shutdown()
|
||||
shutdownFunc = shutdown
|
||||
|
||||
if Keys.Nats != nil {
|
||||
for _, natsConf := range Keys.Nats {
|
||||
// TODO: When multiple nats configs share a URL, do a single connect.
|
||||
wg.Add(1)
|
||||
nc := natsConf
|
||||
go func() {
|
||||
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
|
||||
err := ReceiveNats(nc, ms, 1, ctx)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// InitMetrics creates a new, initialized instance of a MemoryStore.
|
||||
// Will panic if values in the metric configurations are invalid.
|
||||
func InitMetrics(metrics map[string]MetricConfig) {
|
||||
singleton.Do(func() {
|
||||
offset := 0
|
||||
for key, cfg := range metrics {
|
||||
if cfg.Frequency == 0 {
|
||||
panic("[METRICSTORE]> invalid frequency")
|
||||
}
|
||||
|
||||
metrics[key] = MetricConfig{
|
||||
Frequency: cfg.Frequency,
|
||||
Aggregation: cfg.Aggregation,
|
||||
offset: offset,
|
||||
}
|
||||
offset += 1
|
||||
}
|
||||
|
||||
msInstance = &MemoryStore{
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
Metrics: metrics,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetMemoryStore() *MemoryStore {
|
||||
if msInstance == nil {
|
||||
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
|
||||
}
|
||||
|
||||
return msInstance
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
// Cancel the context to signal all background goroutines to stop
|
||||
if shutdownFunc != nil {
|
||||
shutdownFunc()
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
|
||||
var files int
|
||||
var err error
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
|
||||
} else {
|
||||
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
|
||||
close(LineProtocolMessages)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
|
||||
}
|
||||
|
||||
func getName(m *MemoryStore, i int) string {
|
||||
for key, val := range m.Metrics {
|
||||
if val.offset == i {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func Retention(wg *sync.WaitGroup, ctx context.Context) {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.RetentionInMemory)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
d := d / 2
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
|
||||
freed, err := ms.Free(nil, t.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
|
||||
// Look at `findLevelOrCreate` for how selectors work.
|
||||
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
|
||||
var ok bool
|
||||
for i, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
metric.MetricConfig, ok = m.Metrics[metric.Name]
|
||||
if !ok {
|
||||
metric.MetricConfig.Frequency = 0
|
||||
}
|
||||
metrics[i] = metric
|
||||
}
|
||||
}
|
||||
|
||||
return m.WriteToLevel(&m.root, selector, ts, metrics)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) GetLevel(selector []string) *Level {
|
||||
return m.root.findLevelOrCreate(selector, len(m.Metrics))
|
||||
}
|
||||
|
||||
// WriteToLevel assumes that `minfo` in `metrics` is filled in
|
||||
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
|
||||
l = l.findLevelOrCreate(selector, len(m.Metrics))
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
for _, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
b := l.metrics[metric.MetricConfig.offset]
|
||||
if b == nil {
|
||||
// First write to this metric and level
|
||||
b = newBuffer(ts, metric.MetricConfig.Frequency)
|
||||
l.metrics[metric.MetricConfig.offset] = b
|
||||
}
|
||||
|
||||
nb, err := b.write(ts, metric.Value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Last write created a new buffer...
|
||||
if b != nb {
|
||||
l.metrics[metric.MetricConfig.offset] = nb
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
|
||||
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
|
||||
// The second and third return value are the actual from/to for the data. Those can be different from
|
||||
// the range asked for if no data was available.
|
||||
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
|
||||
}
|
||||
|
||||
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
|
||||
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
cdata, cfrom, cto, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto || len(data) != len(cdata) {
|
||||
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
|
||||
if missingfront != 0 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
newlen := len(cdata) - missingback
|
||||
if newlen < 1 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
cdata = cdata[0:newlen]
|
||||
if len(cdata) != len(data) {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
from, to = cfrom, cto
|
||||
}
|
||||
|
||||
data = cdata
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
} else if n == 0 {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
|
||||
} else if n > 1 {
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
normalize := 1. / schema.Float(n)
|
||||
for i := 0; i < len(data); i++ {
|
||||
data[i] *= normalize
|
||||
}
|
||||
} else if minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
|
||||
}
|
||||
}
|
||||
|
||||
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
}
|
||||
|
||||
return data, from, to, resolution, nil
|
||||
}
|
||||
|
||||
// Free releases all buffers for the selected level and all its children that
|
||||
// contain only values older than `t`.
|
||||
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
|
||||
return m.GetLevel(selector).free(t)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FreeAll() error {
|
||||
for k := range m.root.children {
|
||||
delete(m.root.children, k)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) SizeInBytes() int64 {
|
||||
return m.root.sizeInBytes()
|
||||
}
|
||||
|
||||
// ListChildren , given a selector, returns a list of all children of the level
|
||||
// selected.
|
||||
func (m *MemoryStore) ListChildren(selector []string) []string {
|
||||
lvl := &m.root
|
||||
for lvl != nil && len(selector) != 0 {
|
||||
lvl.lock.RLock()
|
||||
next := lvl.children[selector[0]]
|
||||
lvl.lock.RUnlock()
|
||||
lvl = next
|
||||
selector = selector[1:]
|
||||
}
|
||||
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
lvl.lock.RLock()
|
||||
defer lvl.lock.RUnlock()
|
||||
|
||||
children := make([]string, 0, len(lvl.children))
|
||||
for child := range lvl.children {
|
||||
children = append(children, child)
|
||||
}
|
||||
|
||||
return children
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
func TestAssignAggregationStrategy(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected AggregationStrategy
|
||||
wantErr bool
|
||||
}{
|
||||
{"empty string", "", NoAggregation, false},
|
||||
{"sum", "sum", SumAggregation, false},
|
||||
{"avg", "avg", AvgAggregation, false},
|
||||
{"invalid", "invalid", NoAggregation, true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := AssignAggregationStrategy(tt.input)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if result != tt.expected {
|
||||
t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddMetric(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = make(map[string]MetricConfig)
|
||||
|
||||
err := AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if _, ok := Metrics["test_metric"]; !ok {
|
||||
t.Error("AddMetric() did not add metric to Metrics map")
|
||||
}
|
||||
|
||||
// Test updating with higher frequency
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 120,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
|
||||
// Test updating with lower frequency (should not update)
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 30,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricFrequency(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = map[string]MetricConfig{
|
||||
"test_metric": {
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
},
|
||||
}
|
||||
|
||||
freq, err := GetMetricFrequency("test_metric")
|
||||
if err != nil {
|
||||
t.Errorf("GetMetricFrequency() error = %v", err)
|
||||
}
|
||||
if freq != 60 {
|
||||
t.Errorf("GetMetricFrequency() = %d, want 60", freq)
|
||||
}
|
||||
|
||||
_, err = GetMetricFrequency("nonexistent")
|
||||
if err == nil {
|
||||
t.Error("GetMetricFrequency() expected error for nonexistent metric")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferWrite(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Test writing value
|
||||
nb, err := b.write(100, schema.Float(42.0))
|
||||
if err != nil {
|
||||
t.Errorf("buffer.write() error = %v", err)
|
||||
}
|
||||
if nb != b {
|
||||
t.Error("buffer.write() created new buffer unexpectedly")
|
||||
}
|
||||
if len(b.data) != 1 {
|
||||
t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data))
|
||||
}
|
||||
if b.data[0] != schema.Float(42.0) {
|
||||
t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0])
|
||||
}
|
||||
|
||||
// Test writing value from past (should error)
|
||||
_, err = b.write(50, schema.Float(10.0))
|
||||
if err == nil {
|
||||
t.Error("buffer.write() expected error for past timestamp")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferRead(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Write some test data
|
||||
b.write(100, schema.Float(1.0))
|
||||
b.write(110, schema.Float(2.0))
|
||||
b.write(120, schema.Float(3.0))
|
||||
|
||||
// Read data
|
||||
data := make([]schema.Float, 3)
|
||||
result, from, to, err := b.read(100, 130, data)
|
||||
if err != nil {
|
||||
t.Errorf("buffer.read() error = %v", err)
|
||||
}
|
||||
// Buffer read should return from as firstWrite (start + freq/2)
|
||||
if from != 100 {
|
||||
t.Errorf("buffer.read() from = %d, want 100", from)
|
||||
}
|
||||
if to != 130 {
|
||||
t.Errorf("buffer.read() to = %d, want 130", to)
|
||||
}
|
||||
if len(result) != 3 {
|
||||
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
|
||||
}
|
||||
}
|
||||
@@ -1,381 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricDataDispatcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
config.Keys.DisableArchive {
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
cclog.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jd_temp schema.JobData
|
||||
jd_temp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
cclog.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Deep copy the cached archive hashmap
|
||||
jd = metricdata.DeepCopy(jd_temp)
|
||||
|
||||
// Resampling for archived data.
|
||||
// Pass the resolution from frontend here.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := int64(0)
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = int(timestep)
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// FIXME: Review: Is this really necessary or correct.
|
||||
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/median/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
// Existing (archived) StatsSeries can be 'min/mean/max'!
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
cclog.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for statsTable in frontend: Return scoped statistics by metric.
|
||||
func LoadScopedJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scopedStats, nil
|
||||
}
|
||||
|
||||
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
|
||||
func LoadJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.MetricStatistics, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadStatsFromArchive(job, metrics)
|
||||
}
|
||||
|
||||
data := make(map[string]schema.MetricStatistics, len(metrics))
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return data, err
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
min = math.Min(min, node.Min)
|
||||
max = math.Max(max, node.Max)
|
||||
}
|
||||
|
||||
data[m] = schema.MetricStatistics{
|
||||
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user